123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339 |
- /** \file tokenize.c
- *
- * Tokenize a string, accommodating quoted strings.
- *
- * @addtogroup autoopts
- * @{
- */
- /*
- * This file defines the string_tokenize interface
- * This file is part of AutoOpts, a companion to AutoGen.
- * AutoOpts is free software.
- * AutoOpts is Copyright (C) 1992-2015 by Bruce Korb - all rights reserved
- *
- * AutoOpts is available under any one of two licenses. The license
- * in use must be one of these two and the choice is under the control
- * of the user of the license.
- *
- * The GNU Lesser General Public License, version 3 or later
- * See the files "COPYING.lgplv3" and "COPYING.gplv3"
- *
- * The Modified Berkeley Software Distribution License
- * See the file "COPYING.mbsd"
- *
- * These files have the following sha256 sums:
- *
- * 8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95 COPYING.gplv3
- * 4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b COPYING.lgplv3
- * 13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239 COPYING.mbsd
- */
- #include <errno.h>
- #include <stdlib.h>
- #define cc_t const unsigned char
- #define ch_t unsigned char
- /* = = = START-STATIC-FORWARD = = = */
- static void
- copy_cooked(ch_t ** ppDest, char const ** ppSrc);
- static void
- copy_raw(ch_t ** ppDest, char const ** ppSrc);
- static token_list_t *
- alloc_token_list(char const * str);
- /* = = = END-STATIC-FORWARD = = = */
- static void
- copy_cooked(ch_t ** ppDest, char const ** ppSrc)
- {
- ch_t * pDest = (ch_t *)*ppDest;
- const ch_t * pSrc = (const ch_t *)(*ppSrc + 1);
- for (;;) {
- ch_t ch = *(pSrc++);
- switch (ch) {
- case NUL: *ppSrc = NULL; return;
- case '"': goto done;
- case '\\':
- pSrc += ao_string_cook_escape_char((char *)pSrc, (char *)&ch, 0x7F);
- if (ch == 0x7F)
- break;
- /* FALLTHROUGH */
- default:
- *(pDest++) = ch;
- }
- }
- done:
- *ppDest = (ch_t *)pDest; /* next spot for storing character */
- *ppSrc = (char const *)pSrc; /* char following closing quote */
- }
- static void
- copy_raw(ch_t ** ppDest, char const ** ppSrc)
- {
- ch_t * pDest = *ppDest;
- cc_t * pSrc = (cc_t *) (*ppSrc + 1);
- for (;;) {
- ch_t ch = *(pSrc++);
- switch (ch) {
- case NUL: *ppSrc = NULL; return;
- case '\'': goto done;
- case '\\':
- /*
- * *Four* escapes are handled: newline removal, escape char
- * quoting and apostrophe quoting
- */
- switch (*pSrc) {
- case NUL: *ppSrc = NULL; return;
- case '\r':
- if (*(++pSrc) == NL)
- ++pSrc;
- continue;
- case NL:
- ++pSrc;
- continue;
- case '\'':
- ch = '\'';
- /* FALLTHROUGH */
- case '\\':
- ++pSrc;
- break;
- }
- /* FALLTHROUGH */
- default:
- *(pDest++) = ch;
- }
- }
- done:
- *ppDest = pDest; /* next spot for storing character */
- *ppSrc = (char const *) pSrc; /* char following closing quote */
- }
- static token_list_t *
- alloc_token_list(char const * str)
- {
- token_list_t * res;
- int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
- if (str == NULL) goto enoent_res;
- /*
- * Trim leading white space. Use "ENOENT" and a NULL return to indicate
- * an empty string was passed.
- */
- str = SPN_WHITESPACE_CHARS(str);
- if (*str == NUL) goto enoent_res;
- /*
- * Take an approximate count of tokens. If no quoted strings are used,
- * it will be accurate. If quoted strings are used, it will be a little
- * high and we'll squander the space for a few extra pointers.
- */
- {
- char const * pz = str;
- do {
- max_token_ct++;
- pz = BRK_WHITESPACE_CHARS(pz+1);
- pz = SPN_WHITESPACE_CHARS(pz);
- } while (*pz != NUL);
- res = malloc(sizeof(*res) + (size_t)(pz - str)
- + ((size_t)max_token_ct * sizeof(ch_t *)));
- }
- if (res == NULL)
- errno = ENOMEM;
- else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1));
- return res;
- enoent_res:
- errno = ENOENT;
- return NULL;
- }
- /*=export_func ao_string_tokenize
- *
- * what: tokenize an input string
- *
- * arg: + char const * + string + string to be tokenized +
- *
- * ret_type: token_list_t *
- * ret_desc: pointer to a structure that lists each token
- *
- * doc:
- *
- * This function will convert one input string into a list of strings.
- * The list of strings is derived by separating the input based on
- * white space separation. However, if the input contains either single
- * or double quote characters, then the text after that character up to
- * a matching quote will become the string in the list.
- *
- * The returned pointer should be deallocated with @code{free(3C)} when
- * are done using the data. The data are placed in a single block of
- * allocated memory. Do not deallocate individual token/strings.
- *
- * The structure pointed to will contain at least these two fields:
- * @table @samp
- * @item tkn_ct
- * The number of tokens found in the input string.
- * @item tok_list
- * An array of @code{tkn_ct + 1} pointers to substring tokens, with
- * the last pointer set to NULL.
- * @end table
- *
- * There are two types of quoted strings: single quoted (@code{'}) and
- * double quoted (@code{"}). Singly quoted strings are fairly raw in that
- * escape characters (@code{\\}) are simply another character, except when
- * preceding the following characters:
- * @example
- * @code{\\} double backslashes reduce to one
- * @code{'} incorporates the single quote into the string
- * @code{\n} suppresses both the backslash and newline character
- * @end example
- *
- * Double quote strings are formed according to the rules of string
- * constants in ANSI-C programs.
- *
- * example:
- * @example
- * #include <stdlib.h>
- * int ix;
- * token_list_t * ptl = ao_string_tokenize(some_string)
- * for (ix = 0; ix < ptl->tkn_ct; ix++)
- * do_something_with_tkn(ptl->tkn_list[ix]);
- * free(ptl);
- * @end example
- * Note that everything is freed with the one call to @code{free(3C)}.
- *
- * err:
- * NULL is returned and @code{errno} will be set to indicate the problem:
- * @itemize @bullet
- * @item
- * @code{EINVAL} - There was an unterminated quoted string.
- * @item
- * @code{ENOENT} - The input string was empty.
- * @item
- * @code{ENOMEM} - There is not enough memory.
- * @end itemize
- =*/
- token_list_t *
- ao_string_tokenize(char const * str)
- {
- token_list_t * res = alloc_token_list(str);
- ch_t * pzDest;
- /*
- * Now copy each token into the output buffer.
- */
- if (res == NULL)
- return res;
- pzDest = (ch_t *)(res->tkn_list[0]);
- res->tkn_ct = 0;
- do {
- res->tkn_list[ res->tkn_ct++ ] = pzDest;
- for (;;) {
- int ch = (ch_t)*str;
- if (IS_WHITESPACE_CHAR(ch)) {
- found_white_space:
- str = SPN_WHITESPACE_CHARS(str+1);
- break;
- }
- switch (ch) {
- case '"':
- copy_cooked(&pzDest, &str);
- if (str == NULL) {
- free(res);
- errno = EINVAL;
- return NULL;
- }
- if (IS_WHITESPACE_CHAR(*str))
- goto found_white_space;
- break;
- case '\'':
- copy_raw(&pzDest, &str);
- if (str == NULL) {
- free(res);
- errno = EINVAL;
- return NULL;
- }
- if (IS_WHITESPACE_CHAR(*str))
- goto found_white_space;
- break;
- case NUL:
- goto copy_done;
- default:
- str++;
- *(pzDest++) = (unsigned char)ch;
- }
- } copy_done:;
- /*
- * NUL terminate the last token and see if we have any more tokens.
- */
- *(pzDest++) = NUL;
- } while (*str != NUL);
- res->tkn_list[ res->tkn_ct ] = NULL;
- return res;
- }
#ifdef TEST
#include <stdio.h>
#include <string.h>

/*
 * Stand-alone test driver, compiled only when TEST is defined.
 *
 * Each command line argument is passed to ao_string_tokenize() and the
 * resulting tokens are printed, or the errno value and its message when
 * tokenizing fails.  When the argument count is exactly one, a usage line
 * is printed and 1 is returned; otherwise the return value is 0.
 */
int
main(int argc, char ** argv)
{
    int arg_ix;

    if (argc == 1) {
        printf("USAGE: %s arg [ ... ]\n", *argv);
        return 1;
    }

    for (arg_ix = 1; arg_ix < argc; arg_ix++) {
        char *         text = argv[arg_ix];
        token_list_t * toks = ao_string_tokenize(text);
        int            ix   = 0;

        if (toks == NULL) {
            printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
                   text, errno, strerror(errno));
            continue;
        }

        printf("Parsed string ``%s''\ninto %d tokens:\n", text, toks->tkn_ct);

        /* the first token is always printed, hence do/while */
        do {
            printf(" %3d: ``%s''\n", ix+1, toks->tkn_list[ix]);
        } while (++ix < toks->tkn_ct);

        /* the pointer table and the token text share this one block */
        free(toks);
    }

    return 0;
}
#endif
- /** @}
- *
- * Local Variables:
- * mode: C
- * c-file-style: "stroustrup"
- * indent-tabs-mode: nil
- * End:
- * end of autoopts/tokenize.c */
|