is_csv.c 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. /*-
  2. * Copyright (c) 2019 Christos Zoulas
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  15. * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  16. * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17. * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  18. * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  19. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  20. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  21. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  22. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  23. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  24. * POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. /*
  27. * Parse CSV object serialization format (RFC-4180, RFC-7111)
  28. */
  29. #ifndef TEST
  30. #include "file.h"
  31. #ifndef lint
  32. FILE_RCSID("@(#)$File: is_csv.c,v 1.13 2023/07/17 16:08:17 christos Exp $")
  33. #endif
  34. #include <string.h>
  35. #include "magic.h"
  36. #else
  37. #include <sys/types.h>
  38. #endif
  39. #ifdef DEBUG
  40. #include <stdio.h>
  41. #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
  42. #else
  43. #define DPRINTF(fmt, ...)
  44. #endif
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up to the number of lines specified
 *
 * a trailing partial line (one that is not terminated by a newline)
 * is never counted or checked
 */
  53. #ifndef CSV_LINES
  54. #define CSV_LINES 10
  55. #endif
  56. static int csv_parse(const unsigned char *, const unsigned char *);
  57. static const unsigned char *
  58. eatquote(const unsigned char *uc, const unsigned char *ue)
  59. {
  60. int quote = 0;
  61. while (uc < ue) {
  62. unsigned char c = *uc++;
  63. if (c != '"') {
  64. // We already got one, done.
  65. if (quote) {
  66. return --uc;
  67. }
  68. continue;
  69. }
  70. if (quote) {
  71. // quote-quote escapes
  72. quote = 0;
  73. continue;
  74. }
  75. // first quote
  76. quote = 1;
  77. }
  78. return ue;
  79. }
  80. static int
  81. csv_parse(const unsigned char *uc, const unsigned char *ue)
  82. {
  83. size_t nf = 0, tf = 0, nl = 0;
  84. while (uc < ue) {
  85. switch (*uc++) {
  86. case '"':
  87. // Eat until the matching quote
  88. uc = eatquote(uc, ue);
  89. break;
  90. case ',':
  91. nf++;
  92. break;
  93. case '\n':
  94. DPRINTF("%zu %zu %zu\n", nl, nf, tf);
  95. nl++;
  96. #if CSV_LINES
  97. if (nl == CSV_LINES)
  98. return tf != 0 && tf == nf;
  99. #endif
  100. if (tf == 0) {
  101. // First time and no fields, give up
  102. if (nf == 0)
  103. return 0;
  104. // First time, set the number of fields
  105. tf = nf;
  106. } else if (tf != nf) {
  107. // Field number mismatch, we are done.
  108. return 0;
  109. }
  110. nf = 0;
  111. break;
  112. default:
  113. break;
  114. }
  115. }
  116. return tf && nl >= 2;
  117. }
  118. #ifndef TEST
  119. int
  120. file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text,
  121. const char *code)
  122. {
  123. const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
  124. const unsigned char *ue = uc + b->flen;
  125. int mime = ms->flags & MAGIC_MIME;
  126. if (!looks_text)
  127. return 0;
  128. if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
  129. return 0;
  130. if (!csv_parse(uc, ue))
  131. return 0;
  132. if (mime == MAGIC_MIME_ENCODING)
  133. return 1;
  134. if (mime) {
  135. if (file_printf(ms, "text/csv") == -1)
  136. return -1;
  137. return 1;
  138. }
  139. if (file_printf(ms, "CSV %s%stext", code ? code : "",
  140. code ? " " : "") == -1)
  141. return -1;
  142. return 1;
  143. }
  144. #else
  145. #include <sys/types.h>
  146. #include <sys/stat.h>
  147. #include <stdio.h>
  148. #include <fcntl.h>
  149. #include <unistd.h>
  150. #include <stdlib.h>
  151. #include <stdint.h>
  152. #include <err.h>
  153. int
  154. main(int argc, char *argv[])
  155. {
  156. int fd;
  157. struct stat st;
  158. unsigned char *p;
  159. if ((fd = open(argv[1], O_RDONLY)) == -1)
  160. err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
  161. if (fstat(fd, &st) == -1)
  162. err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
  163. if ((p = CAST(char *, malloc(st.st_size))) == NULL)
  164. err(EXIT_FAILURE, "Can't allocate %jd bytes",
  165. (intmax_t)st.st_size);
  166. if (read(fd, p, st.st_size) != st.st_size)
  167. err(EXIT_FAILURE, "Can't read %jd bytes",
  168. (intmax_t)st.st_size);
  169. printf("is csv %d\n", csv_parse(p, p + st.st_size));
  170. return 0;
  171. }
  172. #endif