is_csv.c 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. /*-
  2. * Copyright (c) 2019 Christos Zoulas
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  15. * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  16. * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17. * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  18. * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  19. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  20. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  21. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  22. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  23. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  24. * POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. /*
  27. * Parse CSV object serialization format (RFC-4180, RFC-7111)
  28. */
  29. #ifndef TEST
  30. #include "file.h"
  31. #ifndef lint
  32. FILE_RCSID("@(#)$File: is_csv.c,v 1.15 2024/05/18 15:16:13 christos Exp $")
  33. #endif
  34. #include <string.h>
  35. #include "magic.h"
  36. #else
  37. #define CAST(a, b) ((a)(b))
  38. #include <sys/types.h>
  39. #endif
  40. #ifdef DEBUG
  41. #include <stdio.h>
  42. #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
  43. #else
  44. #define DPRINTF(fmt, ...)
  45. #endif
  46. /*
  47. * if CSV_LINES == 0:
  48. * check all the lines in the buffer
  49. * otherwise:
  50. * check only up-to the number of lines specified
  51. *
  52. * the last line count is always ignored if it does not end in CRLF
  53. */
  54. #ifndef CSV_LINES
  55. #define CSV_LINES 10
  56. #endif
  57. static int csv_parse(const unsigned char *, const unsigned char *);
  /*
   * Skip the rest of a quoted field.
   *
   * uc points at the first byte after the opening quote (the caller has
   * already consumed it) and ue is one past the last byte of the buffer.
   *
   * A double quote followed by another double quote is an escaped quote
   * and stays inside the field.  A double quote followed by any other
   * byte closes the field.
   *
   * Returns a pointer to the first byte following the closing quote, or
   * ue when the buffer ends before such a byte is found.
   */
  static const unsigned char *
  eatquote(const unsigned char *uc, const unsigned char *ue)
  {
      /* Non-zero while the previous byte was a not-yet-paired quote. */
      int pending = 0;
      const unsigned char *p;

      for (p = uc; p < ue; p++) {
          if (*p == '"') {
              /*
               * Either the first quote of a possible pair, or the
               * second quote that turns the pair into an escape.
               */
              pending = !pending;
              continue;
          }
          /* An ordinary byte right after a lone quote: field is over. */
          if (pending)
              return p;
      }
      return ue;
  }
  /*
   * Decide whether the bytes in [uc, ue) look like CSV.
   *
   * The scan counts commas outside of quoted fields on every line that is
   * terminated by '\n'.  The count of the first line becomes the reference
   * and every later line must match it.  Bytes after the last '\n' are
   * never checked.
   *
   * When CSV_LINES is non-zero the scan stops at that many lines and the
   * verdict is taken from that line alone.
   *
   * Returns 1 when the data looks like CSV, 0 otherwise.
   */
  static int
  csv_parse(const unsigned char *uc, const unsigned char *ue)
  {
      size_t commas = 0;    /* commas seen on the current line */
      size_t expect = 0;    /* commas per line, fixed by the first line */
      size_t lines = 0;     /* terminated lines seen so far */

      while (uc < ue) {
          const unsigned char ch = *uc++;

          if (ch == '"') {
              /* Jump over the quoted field, embedded commas included. */
              uc = eatquote(uc, ue);
              continue;
          }
          if (ch == ',') {
              commas++;
              continue;
          }
          if (ch != '\n')
              continue;

          /* End of a line: compare it against the reference count. */
          DPRINTF("%zu %zu %zu\n", lines, commas, expect);
          lines++;
  #if CSV_LINES
          if (lines == CSV_LINES)
              return expect > 1 && expect == commas;
  #endif
          if (expect != 0) {
              /* A later line with a different count is not CSV. */
              if (expect != commas)
                  return 0;
          } else {
              /* First line: it must contain at least one comma. */
              if (commas == 0)
                  return 0;
              expect = commas;
          }
          commas = 0;
      }
      /* Ran out of data: need two or more lines of more than one comma. */
      return expect > 1 && lines >= 2;
  }
  119. #ifndef TEST
  120. int
  121. file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text,
  122. const char *code)
  123. {
  124. const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
  125. const unsigned char *ue = uc + b->flen;
  126. int mime = ms->flags & MAGIC_MIME;
  127. if (!looks_text)
  128. return 0;
  129. if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
  130. return 0;
  131. if (!csv_parse(uc, ue))
  132. return 0;
  133. if (mime == MAGIC_MIME_ENCODING)
  134. return 1;
  135. if (mime) {
  136. if (file_printf(ms, "text/csv") == -1)
  137. return -1;
  138. return 1;
  139. }
  140. if (file_printf(ms, "CSV %s%stext", code ? code : "",
  141. code ? " " : "") == -1)
  142. return -1;
  143. return 1;
  144. }
  145. #else
  146. #include <sys/types.h>
  147. #include <sys/stat.h>
  148. #include <stdio.h>
  149. #include <fcntl.h>
  150. #include <unistd.h>
  151. #include <stdlib.h>
  152. #include <stdint.h>
  153. #include <err.h>
  154. int
  155. main(int argc, char *argv[])
  156. {
  157. int fd;
  158. struct stat st;
  159. unsigned char *p;
  160. if ((fd = open(argv[1], O_RDONLY)) == -1)
  161. err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
  162. if (fstat(fd, &st) == -1)
  163. err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
  164. if ((p = CAST(unsigned char *, malloc(st.st_size))) == NULL)
  165. err(EXIT_FAILURE, "Can't allocate %jd bytes",
  166. (intmax_t)st.st_size);
  167. if (read(fd, p, st.st_size) != st.st_size)
  168. err(EXIT_FAILURE, "Can't read %jd bytes",
  169. (intmax_t)st.st_size);
  170. printf("is csv %d\n", csv_parse(p, p + st.st_size));
  171. return 0;
  172. }
  173. #endif