tokenize.c 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. /** \file tokenize.c
  2. *
  3. * Tokenize a string, accommodating quoted strings.
  4. *
  5. * @addtogroup autoopts
  6. * @{
  7. */
  8. /*
  9. * This file defines the ao_string_tokenize interface
  10. * This file is part of AutoOpts, a companion to AutoGen.
  11. * AutoOpts is free software.
  12. * AutoOpts is Copyright (C) 1992-2014 by Bruce Korb - all rights reserved
  13. *
  14. * AutoOpts is available under any one of two licenses. The license
  15. * in use must be one of these two and the choice is under the control
  16. * of the user of the license.
  17. *
  18. * The GNU Lesser General Public License, version 3 or later
  19. * See the files "COPYING.lgplv3" and "COPYING.gplv3"
  20. *
  21. * The Modified Berkeley Software Distribution License
  22. * See the file "COPYING.mbsd"
  23. *
  24. * These files have the following sha256 sums:
  25. *
  26. * 8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95 COPYING.gplv3
  27. * 4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b COPYING.lgplv3
  28. * 13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239 COPYING.mbsd
  29. */
  30. #include <errno.h>
  31. #include <stdlib.h>
  32. #define cc_t const unsigned char
  33. #define ch_t unsigned char
  34. /* = = = START-STATIC-FORWARD = = = */
  35. static void
  36. copy_cooked(ch_t** ppDest, char const ** ppSrc);
  37. static void
  38. copy_raw(ch_t** ppDest, char const ** ppSrc);
  39. static token_list_t *
  40. alloc_token_list(char const * str);
  41. /* = = = END-STATIC-FORWARD = = = */
  42. static void
  43. copy_cooked(ch_t** ppDest, char const ** ppSrc)
  44. {
  45. ch_t* pDest = (ch_t*)*ppDest;
  46. const ch_t* pSrc = (const ch_t*)(*ppSrc + 1);
  47. for (;;) {
  48. ch_t ch = *(pSrc++);
  49. switch (ch) {
  50. case NUL: *ppSrc = NULL; return;
  51. case '"': goto done;
  52. case '\\':
  53. pSrc += ao_string_cook_escape_char((char*)pSrc, (char*)&ch, 0x7F);
  54. if (ch == 0x7F)
  55. break;
  56. /* FALLTHROUGH */
  57. default:
  58. *(pDest++) = ch;
  59. }
  60. }
  61. done:
  62. *ppDest = (ch_t*)pDest; /* next spot for storing character */
  63. *ppSrc = (char const *)pSrc; /* char following closing quote */
  64. }
  65. static void
  66. copy_raw(ch_t** ppDest, char const ** ppSrc)
  67. {
  68. ch_t* pDest = *ppDest;
  69. cc_t* pSrc = (cc_t*) (*ppSrc + 1);
  70. for (;;) {
  71. ch_t ch = *(pSrc++);
  72. switch (ch) {
  73. case NUL: *ppSrc = NULL; return;
  74. case '\'': goto done;
  75. case '\\':
  76. /*
  77. * *Four* escapes are handled: newline removal, escape char
  78. * quoting and apostrophe quoting
  79. */
  80. switch (*pSrc) {
  81. case NUL: *ppSrc = NULL; return;
  82. case '\r':
  83. if (*(++pSrc) == NL)
  84. ++pSrc;
  85. continue;
  86. case NL:
  87. ++pSrc;
  88. continue;
  89. case '\'':
  90. ch = '\'';
  91. /* FALLTHROUGH */
  92. case '\\':
  93. ++pSrc;
  94. break;
  95. }
  96. /* FALLTHROUGH */
  97. default:
  98. *(pDest++) = ch;
  99. }
  100. }
  101. done:
  102. *ppDest = pDest; /* next spot for storing character */
  103. *ppSrc = (char const *) pSrc; /* char following closing quote */
  104. }
  105. static token_list_t *
  106. alloc_token_list(char const * str)
  107. {
  108. token_list_t * res;
  109. int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
  110. if (str == NULL) goto enoent_res;
  111. /*
  112. * Trim leading white space. Use "ENOENT" and a NULL return to indicate
  113. * an empty string was passed.
  114. */
  115. str = SPN_WHITESPACE_CHARS(str);
  116. if (*str == NUL) goto enoent_res;
  117. /*
  118. * Take an approximate count of tokens. If no quoted strings are used,
  119. * it will be accurate. If quoted strings are used, it will be a little
  120. * high and we'll squander the space for a few extra pointers.
  121. */
  122. {
  123. char const * pz = str;
  124. do {
  125. max_token_ct++;
  126. pz = BRK_WHITESPACE_CHARS(pz+1);
  127. pz = SPN_WHITESPACE_CHARS(pz);
  128. } while (*pz != NUL);
  129. res = malloc(sizeof(*res) + (size_t)(pz - str)
  130. + ((size_t)max_token_ct * sizeof(ch_t*)));
  131. }
  132. if (res == NULL)
  133. errno = ENOMEM;
  134. else res->tkn_list[0] = (ch_t*)(res->tkn_list + (max_token_ct - 1));
  135. return res;
  136. enoent_res:
  137. errno = ENOENT;
  138. return NULL;
  139. }
  140. /*=export_func ao_string_tokenize
  141. *
  142. * what: tokenize an input string
  143. *
  144. * arg: + char const* + string + string to be tokenized +
  145. *
  146. * ret_type: token_list_t*
  147. * ret_desc: pointer to a structure that lists each token
  148. *
  149. * doc:
  150. *
  151. * This function will convert one input string into a list of strings.
  152. * The list of strings is derived by separating the input based on
  153. * white space separation. However, if the input contains either single
  154. * or double quote characters, then the text after that character up to
  155. * a matching quote will become the string in the list.
  156. *
  157. * The returned pointer should be deallocated with @code{free(3C)} when
  158. * you are done using the data. The data are placed in a single block of
  159. * allocated memory. Do not deallocate individual token/strings.
  160. *
  161. * The structure pointed to will contain at least these two fields:
  162. * @table @samp
  163. * @item tkn_ct
  164. * The number of tokens found in the input string.
  165. * @item tkn_list
  166. * An array of @code{tkn_ct + 1} pointers to substring tokens, with
  167. * the last pointer set to NULL.
  168. * @end table
  169. *
  170. * There are two types of quoted strings: single quoted (@code{'}) and
  171. * double quoted (@code{"}). Singly quoted strings are fairly raw in that
  172. * escape characters (@code{\\}) are simply another character, except when
  173. * preceding the following characters:
  174. * @example
  175. * @code{\\} double backslashes reduce to one
  176. * @code{'} incorporates the single quote into the string
  177. * @code{\n} suppresses both the backslash and newline character
  178. * @end example
  179. *
  180. * Double quote strings are formed according to the rules of string
  181. * constants in ANSI-C programs.
  182. *
  183. * example:
  184. * @example
  185. * #include <stdlib.h>
  186. * int ix;
  187. * token_list_t* ptl = ao_string_tokenize(some_string);
  188. * for (ix = 0; ix < ptl->tkn_ct; ix++)
  189. * do_something_with_tkn(ptl->tkn_list[ix]);
  190. * free(ptl);
  191. * @end example
  192. * Note that everything is freed with the one call to @code{free(3C)}.
  193. *
  194. * err:
  195. * NULL is returned and @code{errno} will be set to indicate the problem:
  196. * @itemize @bullet
  197. * @item
  198. * @code{EINVAL} - There was an unterminated quoted string.
  199. * @item
  200. * @code{ENOENT} - The input string was empty.
  201. * @item
  202. * @code{ENOMEM} - There is not enough memory.
  203. * @end itemize
  204. =*/
  205. token_list_t*
  206. ao_string_tokenize(char const* str)
  207. {
  208. token_list_t* res = alloc_token_list(str);
  209. ch_t* pzDest;
  210. /*
  211. * Now copy each token into the output buffer.
  212. */
  213. if (res == NULL)
  214. return res;
  215. pzDest = (ch_t*)(res->tkn_list[0]);
  216. res->tkn_ct = 0;
  217. do {
  218. res->tkn_list[ res->tkn_ct++ ] = pzDest;
  219. for (;;) {
  220. int ch = (ch_t)*str;
  221. if (IS_WHITESPACE_CHAR(ch)) {
  222. found_white_space:
  223. str = SPN_WHITESPACE_CHARS(str+1);
  224. break;
  225. }
  226. switch (ch) {
  227. case '"':
  228. copy_cooked(&pzDest, &str);
  229. if (str == NULL) {
  230. free(res);
  231. errno = EINVAL;
  232. return NULL;
  233. }
  234. if (IS_WHITESPACE_CHAR(*str))
  235. goto found_white_space;
  236. break;
  237. case '\'':
  238. copy_raw(&pzDest, &str);
  239. if (str == NULL) {
  240. free(res);
  241. errno = EINVAL;
  242. return NULL;
  243. }
  244. if (IS_WHITESPACE_CHAR(*str))
  245. goto found_white_space;
  246. break;
  247. case NUL:
  248. goto copy_done;
  249. default:
  250. str++;
  251. *(pzDest++) = (unsigned char)ch;
  252. }
  253. } copy_done:;
  254. /*
  255. * NUL terminate the last token and see if we have any more tokens.
  256. */
  257. *(pzDest++) = NUL;
  258. } while (*str != NUL);
  259. res->tkn_list[ res->tkn_ct ] = NULL;
  260. return res;
  261. }
  262. #ifdef TEST
  263. #include <stdio.h>
  264. #include <string.h>
  265. int
  266. main(int argc, char** argv)
  267. {
  268. if (argc == 1) {
  269. printf("USAGE: %s arg [ ... ]\n", *argv);
  270. return 1;
  271. }
  272. while (--argc > 0) {
  273. char* arg = *(++argv);
  274. token_list_t* p = ao_string_tokenize(arg);
  275. if (p == NULL) {
  276. printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
  277. arg, errno, strerror(errno));
  278. } else {
  279. int ix = 0;
  280. printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct);
  281. do {
  282. printf(" %3d: ``%s''\n", ix+1, p->tkn_list[ix]);
  283. } while (++ix < p->tkn_ct);
  284. free(p);
  285. }
  286. }
  287. return 0;
  288. }
  289. #endif
  290. /** @}
  291. *
  292. * Local Variables:
  293. * mode: C
  294. * c-file-style: "stroustrup"
  295. * indent-tabs-mode: nil
  296. * End:
  297. * end of autoopts/tokenize.c */