tokenize.c 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. /*
  2. * This file defines the string_tokenize interface
  3. * Time-stamp: "2005-04-25 18:47:21 bkorb"
  4. *
  5. * string_tokenize copyright 2005 Bruce Korb
  6. *
  7. * string_tokenize is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * string_tokenize is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with string_tokenize; if not, write to:
  19. * The Free Software Foundation, Inc.,
  20. * 51 Franklin Street, Fifth Floor,
  21. * Boston, MA 02110-1301, USA.
  22. */
  23. #include <ctype.h>
  24. #include <errno.h>
  25. #include <stdlib.h>
  26. #define cc_t const unsigned char
  27. #define ch_t unsigned char
  28. /* = = = START-STATIC-FORWARD = = = */
  29. /* static forward declarations maintained by :mkfwd */
  30. static void
  31. copy_cooked( ch_t** ppDest, cc_t** ppSrc );
  32. static void
  33. copy_raw( ch_t** ppDest, cc_t** ppSrc );
  34. /* = = = END-STATIC-FORWARD = = = */
  35. static void
  36. copy_cooked( ch_t** ppDest, cc_t** ppSrc )
  37. {
  38. ch_t* pDest = (ch_t*)*ppDest;
  39. const ch_t* pSrc = (const ch_t*)(*ppSrc + 1);
  40. for (;;) {
  41. ch_t ch = *(pSrc++);
  42. switch (ch) {
  43. case NUL: *ppSrc = NULL; return;
  44. case '"': goto done;
  45. case '\\':
  46. pSrc += ao_string_cook_escape_char( (char*)pSrc, (char*)&ch, 0x7F );
  47. if (ch == 0x7F)
  48. break;
  49. /* FALLTHROUGH */
  50. default:
  51. *(pDest++) = ch;
  52. }
  53. }
  54. done:
  55. *ppDest = (ch_t*)pDest; /* next spot for storing character */
  56. *ppSrc = (ch_t*)pSrc; /* char following closing quote */
  57. }
  58. static void
  59. copy_raw( ch_t** ppDest, cc_t** ppSrc )
  60. {
  61. ch_t* pDest = *ppDest;
  62. cc_t* pSrc = *ppSrc + 1;
  63. for (;;) {
  64. ch_t ch = *(pSrc++);
  65. switch (ch) {
  66. case NUL: *ppSrc = NULL; return;
  67. case '\'': goto done;
  68. case '\\':
  69. /*
  70. * *Four* escapes are handled: newline removal, escape char
  71. * quoting and apostrophe quoting
  72. */
  73. switch (*pSrc) {
  74. case NUL: *ppSrc = NULL; return;
  75. case '\r':
  76. if (*(++pSrc) == '\n')
  77. ++pSrc;
  78. continue;
  79. case '\n':
  80. ++pSrc;
  81. continue;
  82. case '\'':
  83. ch = '\'';
  84. /* FALLTHROUGH */
  85. case '\\':
  86. ++pSrc;
  87. break;
  88. }
  89. /* FALLTHROUGH */
  90. default:
  91. *(pDest++) = ch;
  92. }
  93. }
  94. done:
  95. *ppDest = pDest; /* next spot for storing character */
  96. *ppSrc = pSrc; /* char following closing quote */
  97. }
  98. /*=export_func ao_string_tokenize
  99. *
  100. * what: tokenize an input string
  101. *
  102. * arg: + const char* + string + string to be tokenized +
  103. *
  104. * ret_type: token_list_t*
  105. * ret_desc: pointer to a structure that lists each token
  106. *
  107. * doc:
  108. *
  109. * This function will convert one input string into a list of strings.
  110. * The list of strings is derived by separating the input based on
  111. * white space separation. However, if the input contains either single
  112. * or double quote characters, then the text after that character up to
  113. * a matching quote will become the string in the list.
  114. *
  115. * The returned pointer should be deallocated with @code{free(3C)} when
  116. * are done using the data. The data are placed in a single block of
  117. * allocated memory. Do not deallocate individual token/strings.
  118. *
  119. * The structure pointed to will contain at least these two fields:
  120. * @table @samp
  121. * @item tkn_ct
  122. * The number of tokens found in the input string.
  123. * @item tok_list
  124. * An array of @code{tkn_ct + 1} pointers to substring tokens, with
  125. * the last pointer set to NULL.
  126. * @end table
  127. *
  128. * There are two types of quoted strings: single quoted (@code{'}) and
  129. * double quoted (@code{"}). Singly quoted strings are fairly raw in that
  130. * escape characters (@code{\\}) are simply another character, except when
  131. * preceding the following characters:
  132. * @example
  133. * @code{\\} double backslashes reduce to one
  134. * @code{'} incorporates the single quote into the string
  135. * @code{\n} suppresses both the backslash and newline character
  136. * @end example
  137. *
  138. * Double quote strings are formed according to the rules of string
  139. * constants in ANSI-C programs.
  140. *
  141. * example:
  142. * @example
  143. * #include <stdlib.h>
  144. * int ix;
  145. * token_list_t* ptl = ao_string_tokenize( some_string )
  146. * for (ix = 0; ix < ptl->tkn_ct; ix++)
  147. * do_something_with_tkn( ptl->tkn_list[ix] );
  148. * free( ptl );
  149. * @end example
  150. * Note that everything is freed with the one call to @code{free(3C)}.
  151. *
  152. * err:
  153. * NULL is returned and @code{errno} will be set to indicate the problem:
  154. * @itemize @bullet
  155. * @item
  156. * @code{EINVAL} - There was an unterminated quoted string.
  157. * @item
  158. * @code{ENOENT} - The input string was empty.
  159. * @item
  160. * @code{ENOMEM} - There is not enough memory.
  161. * @end itemize
  162. =*/
  163. token_list_t*
  164. ao_string_tokenize( const char* str )
  165. {
  166. int max_token_ct = 1; /* allow for trailing NUL on string */
  167. token_list_t* res;
  168. if (str == NULL) goto bogus_str;
  169. /*
  170. * Trim leading white space. Use "ENOENT" and a NULL return to indicate
  171. * an empty string was passed.
  172. */
  173. while (isspace( *str )) str++;
  174. if (*str == NUL) {
  175. bogus_str:
  176. errno = ENOENT;
  177. return NULL;
  178. }
  179. /*
  180. * Take an approximate count of tokens. If no quoted strings are used,
  181. * it will be accurate. If quoted strings are used, it will be a little
  182. * high and we'll squander the space for a few extra pointers.
  183. */
  184. {
  185. cc_t* pz = (cc_t*)str;
  186. do {
  187. max_token_ct++;
  188. while (! isspace( *++pz ))
  189. if (*pz == NUL) goto found_nul;
  190. while (isspace( *pz )) pz++;
  191. } while (*pz != NUL);
  192. found_nul:
  193. ;
  194. }
  195. res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) );
  196. if (res == NULL) {
  197. errno = ENOMEM;
  198. return res;
  199. }
  200. /*
  201. * Now copy each token into the output buffer.
  202. */
  203. {
  204. ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1));
  205. res->tkn_ct = 0;
  206. do {
  207. res->tkn_list[ res->tkn_ct++ ] = pzDest;
  208. for (;;) {
  209. char ch = *str;
  210. if (isspace( ch )) {
  211. found_white_space:
  212. while (isspace( *++str )) ;
  213. break;
  214. }
  215. switch (ch) {
  216. case '"':
  217. copy_cooked( &pzDest, (cc_t**)&str );
  218. if (str == NULL) {
  219. free(res);
  220. errno = EINVAL;
  221. return NULL;
  222. }
  223. if (isspace( *str ))
  224. goto found_white_space;
  225. break;
  226. case '\'':
  227. copy_raw( &pzDest, (cc_t**)&str );
  228. if (str == NULL) {
  229. free(res);
  230. errno = EINVAL;
  231. return NULL;
  232. }
  233. if (isspace( *str ))
  234. goto found_white_space;
  235. break;
  236. case NUL:
  237. goto copy_done;
  238. default:
  239. str++;
  240. *(pzDest++) = ch;
  241. }
  242. } copy_done:;
  243. /*
  244. * NUL terminate the last token and see if we have any more tokens.
  245. */
  246. *(pzDest++) = NUL;
  247. } while (*str != NUL);
  248. res->tkn_list[ res->tkn_ct ] = NULL;
  249. }
  250. return res;
  251. }
  252. #ifdef TEST
  253. #include <stdio.h>
  254. #include <string.h>
  255. int
  256. main( int argc, char** argv )
  257. {
  258. if (argc == 1) {
  259. printf("USAGE: %s arg [ ... ]\n", *argv);
  260. return 1;
  261. }
  262. while (--argc > 0) {
  263. char* arg = *(++argv);
  264. token_list_t* p = ao_string_tokenize( arg );
  265. if (p == NULL) {
  266. printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
  267. arg, errno, strerror( errno ));
  268. } else {
  269. int ix = 0;
  270. printf( "Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct );
  271. do {
  272. printf( " %3d: ``%s''\n", ix+1, p->tkn_list[ix] );
  273. } while (++ix < p->tkn_ct);
  274. free(p);
  275. }
  276. }
  277. return 0;
  278. }
  279. #endif
  280. /*
  281. * Local Variables:
  282. * mode: C
  283. * c-file-style: "stroustrup"
  284. * tab-width: 4
  285. * indent-tabs-mode: nil
  286. * End:
  287. * end of autoopts/tokenize.c */