tokenize.c 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. /*
  2. * This file defines the string_tokenize interface
  3. * Time-stamp: "2007-11-12 20:40:36 bkorb"
  4. *
  5. * This file is part of AutoOpts, a companion to AutoGen.
  6. * AutoOpts is free software.
  7. * AutoOpts is copyright (c) 1992-2008 by Bruce Korb - all rights reserved
  8. * AutoOpts is copyright (c) 1992-2008 by Bruce Korb - all rights reserved
  9. *
  10. * AutoOpts is available under any one of two licenses. The license
  11. * in use must be one of these two and the choice is under the control
  12. * of the user of the license.
  13. *
  14. * The GNU Lesser General Public License, version 3 or later
  15. * See the files "COPYING.lgplv3" and "COPYING.gplv3"
  16. *
  17. * The Modified Berkeley Software Distribution License
  18. * See the file "COPYING.mbsd"
  19. *
  20. * These files have the following md5sums:
  21. *
  22. * 239588c55c22c60ffe159946a760a33e pkg/libopts/COPYING.gplv3
  23. * fa82ca978890795162346e661b47161a pkg/libopts/COPYING.lgplv3
  24. * 66a5cedaf62c4b2637025f049f9b826f pkg/libopts/COPYING.mbsd
  25. */
  26. #include <errno.h>
  27. #include <stdlib.h>
  /*
   * Byte-type shorthands used throughout the tokenizer: "ch_t" is a
   * writable byte, "cc_t" is a read-only byte.
   */
  #define cc_t const unsigned char
  #define ch_t unsigned char

  /* = = = START-STATIC-FORWARD = = = */
  /* static forward declarations maintained by mk-fwd */
  static void copy_cooked( ch_t** ppDest, char const ** ppSrc );
  static void copy_raw( ch_t** ppDest, char const ** ppSrc );
  /* = = = END-STATIC-FORWARD = = = */
  37. static void
  38. copy_cooked( ch_t** ppDest, char const ** ppSrc )
  39. {
  40. ch_t* pDest = (ch_t*)*ppDest;
  41. const ch_t* pSrc = (const ch_t*)(*ppSrc + 1);
  42. for (;;) {
  43. ch_t ch = *(pSrc++);
  44. switch (ch) {
  45. case NUL: *ppSrc = NULL; return;
  46. case '"': goto done;
  47. case '\\':
  48. pSrc += ao_string_cook_escape_char( (char*)pSrc, (char*)&ch, 0x7F );
  49. if (ch == 0x7F)
  50. break;
  51. /* FALLTHROUGH */
  52. default:
  53. *(pDest++) = ch;
  54. }
  55. }
  56. done:
  57. *ppDest = (ch_t*)pDest; /* next spot for storing character */
  58. *ppSrc = (char const *)pSrc; /* char following closing quote */
  59. }
  60. static void
  61. copy_raw( ch_t** ppDest, char const ** ppSrc )
  62. {
  63. ch_t* pDest = *ppDest;
  64. cc_t* pSrc = (cc_t*) (*ppSrc + 1);
  65. for (;;) {
  66. ch_t ch = *(pSrc++);
  67. switch (ch) {
  68. case NUL: *ppSrc = NULL; return;
  69. case '\'': goto done;
  70. case '\\':
  71. /*
  72. * *Four* escapes are handled: newline removal, escape char
  73. * quoting and apostrophe quoting
  74. */
  75. switch (*pSrc) {
  76. case NUL: *ppSrc = NULL; return;
  77. case '\r':
  78. if (*(++pSrc) == '\n')
  79. ++pSrc;
  80. continue;
  81. case '\n':
  82. ++pSrc;
  83. continue;
  84. case '\'':
  85. ch = '\'';
  86. /* FALLTHROUGH */
  87. case '\\':
  88. ++pSrc;
  89. break;
  90. }
  91. /* FALLTHROUGH */
  92. default:
  93. *(pDest++) = ch;
  94. }
  95. }
  96. done:
  97. *ppDest = pDest; /* next spot for storing character */
  98. *ppSrc = (char const *) pSrc; /* char following closing quote */
  99. }
  100. /*=export_func ao_string_tokenize
  101. *
  102. * what: tokenize an input string
  103. *
  104. * arg: + char const* + string + string to be tokenized +
  105. *
  106. * ret_type: token_list_t*
  107. * ret_desc: pointer to a structure that lists each token
  108. *
  109. * doc:
  110. *
  111. * This function will convert one input string into a list of strings.
  112. * The list of strings is derived by separating the input based on
  113. * white space separation. However, if the input contains either single
  114. * or double quote characters, then the text after that character up to
  115. * a matching quote will become the string in the list.
  116. *
  117. * The returned pointer should be deallocated with @code{free(3C)} when
  118. * are done using the data. The data are placed in a single block of
  119. * allocated memory. Do not deallocate individual token/strings.
  120. *
  121. * The structure pointed to will contain at least these two fields:
  122. * @table @samp
  123. * @item tkn_ct
  124. * The number of tokens found in the input string.
  125. * @item tok_list
  126. * An array of @code{tkn_ct + 1} pointers to substring tokens, with
  127. * the last pointer set to NULL.
  128. * @end table
  129. *
  130. * There are two types of quoted strings: single quoted (@code{'}) and
  131. * double quoted (@code{"}). Singly quoted strings are fairly raw in that
  132. * escape characters (@code{\\}) are simply another character, except when
  133. * preceding the following characters:
  134. * @example
  135. * @code{\\} double backslashes reduce to one
  136. * @code{'} incorporates the single quote into the string
  137. * @code{\n} suppresses both the backslash and newline character
  138. * @end example
  139. *
  140. * Double quote strings are formed according to the rules of string
  141. * constants in ANSI-C programs.
  142. *
  143. * example:
  144. * @example
  145. * #include <stdlib.h>
  146. * int ix;
  147. * token_list_t* ptl = ao_string_tokenize( some_string )
  148. * for (ix = 0; ix < ptl->tkn_ct; ix++)
  149. * do_something_with_tkn( ptl->tkn_list[ix] );
  150. * free( ptl );
  151. * @end example
  152. * Note that everything is freed with the one call to @code{free(3C)}.
  153. *
  154. * err:
  155. * NULL is returned and @code{errno} will be set to indicate the problem:
  156. * @itemize @bullet
  157. * @item
  158. * @code{EINVAL} - There was an unterminated quoted string.
  159. * @item
  160. * @code{ENOENT} - The input string was empty.
  161. * @item
  162. * @code{ENOMEM} - There is not enough memory.
  163. * @end itemize
  164. =*/
  165. token_list_t*
  166. ao_string_tokenize( char const* str )
  167. {
  168. int max_token_ct = 1; /* allow for trailing NUL on string */
  169. token_list_t* res;
  170. if (str == NULL) goto bogus_str;
  171. /*
  172. * Trim leading white space. Use "ENOENT" and a NULL return to indicate
  173. * an empty string was passed.
  174. */
  175. while (IS_WHITESPACE_CHAR(*str)) str++;
  176. if (*str == NUL) {
  177. bogus_str:
  178. errno = ENOENT;
  179. return NULL;
  180. }
  181. /*
  182. * Take an approximate count of tokens. If no quoted strings are used,
  183. * it will be accurate. If quoted strings are used, it will be a little
  184. * high and we'll squander the space for a few extra pointers.
  185. */
  186. {
  187. cc_t* pz = (cc_t*)str;
  188. do {
  189. max_token_ct++;
  190. while (! IS_WHITESPACE_CHAR(*++pz))
  191. if (*pz == NUL) goto found_nul;
  192. while (IS_WHITESPACE_CHAR(*pz)) pz++;
  193. } while (*pz != NUL);
  194. found_nul:
  195. ;
  196. }
  197. res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) );
  198. if (res == NULL) {
  199. errno = ENOMEM;
  200. return res;
  201. }
  202. /*
  203. * Now copy each token into the output buffer.
  204. */
  205. {
  206. ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1));
  207. res->tkn_ct = 0;
  208. do {
  209. res->tkn_list[ res->tkn_ct++ ] = pzDest;
  210. for (;;) {
  211. int ch = (ch_t)*str;
  212. if (IS_WHITESPACE_CHAR(ch)) {
  213. found_white_space:
  214. while (IS_WHITESPACE_CHAR(*++str)) ;
  215. break;
  216. }
  217. switch (ch) {
  218. case '"':
  219. copy_cooked( &pzDest, &str );
  220. if (str == NULL) {
  221. free(res);
  222. errno = EINVAL;
  223. return NULL;
  224. }
  225. if (IS_WHITESPACE_CHAR(*str))
  226. goto found_white_space;
  227. break;
  228. case '\'':
  229. copy_raw( &pzDest, &str );
  230. if (str == NULL) {
  231. free(res);
  232. errno = EINVAL;
  233. return NULL;
  234. }
  235. if (IS_WHITESPACE_CHAR(*str))
  236. goto found_white_space;
  237. break;
  238. case NUL:
  239. goto copy_done;
  240. default:
  241. str++;
  242. *(pzDest++) = ch;
  243. }
  244. } copy_done:;
  245. /*
  246. * NUL terminate the last token and see if we have any more tokens.
  247. */
  248. *(pzDest++) = NUL;
  249. } while (*str != NUL);
  250. res->tkn_list[ res->tkn_ct ] = NULL;
  251. }
  252. return res;
  253. }
  #ifdef TEST
  #include <stdio.h>
  #include <string.h>

  /*
   * Test driver: run ao_string_tokenize() on every command line argument
   * and print either the resulting tokens or the errno value describing
   * the failure.  With no arguments, print a usage line and return 1.
   */
  int
  main( int argc, char** argv )
  {
      int arg_ix;

      if (argc == 1) {
          printf("USAGE: %s arg [ ... ]\n", *argv);
          return 1;
      }

      for (arg_ix = 1; arg_ix < argc; arg_ix++) {
          char*         text = argv[arg_ix];
          token_list_t* toks = ao_string_tokenize( text );
          int           tkn_ix;

          if (toks == NULL) {
              printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
                      text, errno, strerror( errno ));
              continue;
          }

          printf( "Parsed string ``%s''\ninto %d tokens:\n", text, toks->tkn_ct );

          /* the first token is always printed before the count is checked */
          tkn_ix = 0;
          do {
              printf( " %3d: ``%s''\n", tkn_ix+1, toks->tkn_list[tkn_ix] );
              tkn_ix++;
          } while (tkn_ix < toks->tkn_ct);

          free(toks);
      }

      return 0;
  }
  #endif
  282. /*
  283. * Local Variables:
  284. * mode: C
  285. * c-file-style: "stroustrup"
  286. * indent-tabs-mode: nil
  287. * End:
  288. * end of autoopts/tokenize.c */