/* is_csv.c */
  /*-
   * Copyright (c) 2019 Christos Zoulas
   * All rights reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in the
   *    documentation and/or other materials provided with the distribution.
   *
   * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   * POSSIBILITY OF SUCH DAMAGE.
   */
  /*
   * Parse CSV object serialization format (RFC-4180, RFC-7111)
   */
  #ifndef TEST
  #include "file.h"
  #ifndef lint
  FILE_RCSID("@(#)$File: is_csv.c,v 1.4 2019/06/26 20:31:31 christos Exp $")
  #endif
  #include <string.h>
  #include "magic.h"
  #else
  #include <sys/types.h>
  #endif
  /*
   * DPRINTF(fmt, ...): printf-style trace output, compiled in only when
   * DEBUG is defined.  At least one argument must follow the format string,
   * because the DEBUG form expands __VA_ARGS__ after a comma.
   */
  #ifdef DEBUG
  #include <stdio.h>
  #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
  #else
  /* Expands to a void expression so a call is still a complete statement. */
  #define DPRINTF(fmt, ...) ((void)0)
  #endif
  /*
   * if CSV_LINES == 0:
   *     check all the lines in the buffer
   * otherwise:
   *     check only up to the number of lines specified
   *
   * a final line that is not terminated by a newline ('\n') is never counted
   */
  /*
   * Default number of lines csv_parse() examines before deciding; a value
   * of 0 compiles the line limit out.  May be overridden at build time.
   */
  #ifndef CSV_LINES
  #define CSV_LINES 10
  #endif

  /* Returns non-zero when the bytes in [uc, ue) look like CSV. */
  static int csv_parse(const unsigned char *uc, const unsigned char *ue);
  /*
   * Skip the remainder of a double-quoted CSV field.
   *
   * uc points just past the opening quote; ue is one past the last byte of
   * the buffer.  Inside the field a doubled quote ("") stands for a single
   * literal quote and does not end the field.
   *
   * Returns a pointer to the first byte after the closing quote, or ue when
   * the buffer ends first (this includes a field that is never closed and a
   * closing quote that is the very last byte).
   */
  static const unsigned char *
  eatquote(const unsigned char *uc, const unsigned char *ue)
  {
      // Non-zero while the previous byte was a quote not yet paired up.
      int quote = 0;

      while (uc < ue) {
          unsigned char c = *uc++;

          if (c != '"') {
              // The previous byte was the closing quote: stop on this byte.
              if (quote)
                  return --uc;
              continue;
          }
          // A quote right after a pending quote is the "" escape and clears
          // it; otherwise this quote becomes the pending (possibly closing) one.
          quote = !quote;
      }
      return ue;
  }
  /*
   * Decide whether the bytes in [uc, ue) look like CSV.
   *
   * Commas are counted per '\n'-terminated line.  Quoted fields are skipped
   * with eatquote(), so commas and newlines inside them are not counted.
   * The first complete line fixes the expected comma count; it must be
   * non-zero, and every later line must match it.
   *
   * Returns 0 as soon as a line breaks those rules.  When CSV_LINES is
   * non-zero and that many lines have been read, returns 1 only if a comma
   * count had been fixed by an earlier line and that last line matches it.
   * Otherwise, at the end of the buffer, returns 1 only if a comma count was
   * fixed and more than two complete lines were seen.  Bytes after the last
   * '\n' are ignored.
   */
  static int
  csv_parse(const unsigned char *uc, const unsigned char *ue)
  {
      size_t nf = 0;    // commas seen on the current line
      size_t tf = 0;    // commas per line, fixed by the first complete line
      size_t nl = 0;    // complete lines seen so far

      while (uc < ue) {
          switch (*uc++) {
          case '"':
              // Eat until the matching quote
              uc = eatquote(uc, ue);
              break;
          case ',':
              nf++;
              break;
          case '\n':
              DPRINTF("%zu %zu %zu\n", nl, nf, tf);
              nl++;
  #if CSV_LINES
              // Line limit reached: decide on what has been seen so far.
              if (nl == CSV_LINES)
                  return tf != 0 && tf == nf;
  #endif
              if (tf == 0) {
                  // First time and no fields, give up
                  if (nf == 0)
                      return 0;
                  // First time, set the number of fields
                  tf = nf;
              } else if (tf != nf) {
                  // Field number mismatch, we are done.
                  return 0;
              }
              nf = 0;
              break;
          default:
              break;
          }
      }
      return tf && nl > 2;
  }
  #ifndef TEST
  120. int
  121. file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text)
  122. {
  123. const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
  124. const unsigned char *ue = uc + b->flen;
  125. int mime = ms->flags & MAGIC_MIME;
  126. if (!looks_text)
  127. return 0;
  128. if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
  129. return 0;
  130. if (!csv_parse(uc, ue))
  131. return 0;
  132. if (mime == MAGIC_MIME_ENCODING)
  133. return 1;
  134. if (mime) {
  135. if (file_printf(ms, "application/csv") == -1)
  136. return -1;
  137. return 1;
  138. }
  139. if (file_printf(ms, "CSV text") == -1)
  140. return -1;
  141. return 1;
  142. }
  #else
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <stdio.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <stdlib.h>
  #include <stdint.h>
  #include <err.h>
  /*
   * Stand-alone test driver (built when TEST is defined): read the file named
   * by argv[1] into memory, run csv_parse() over it and print the verdict as
   * "is csv 0" or "is csv 1".  Exits with EXIT_FAILURE on any error.
   */
  int
  main(int argc, char *argv[])
  {
      int fd;
      struct stat st;
      unsigned char *p;
      size_t len;
      ssize_t nr;

      if (argc < 2)
          errx(EXIT_FAILURE, "Usage: %s <file>",
              argc > 0 ? argv[0] : "is_csv");

      if ((fd = open(argv[1], O_RDONLY)) == -1)
          err(EXIT_FAILURE, "Can't open `%s'", argv[1]);

      if (fstat(fd, &st) == -1)
          err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);

      if (st.st_size < 0 || (uintmax_t)st.st_size > SIZE_MAX)
          errx(EXIT_FAILURE, "Can't handle %jd bytes",
              (intmax_t)st.st_size);
      len = (size_t)st.st_size;

      // malloc(0) may legitimately return NULL; never ask for zero bytes so
      // an empty file is not reported as an allocation failure.
      if ((p = malloc(len != 0 ? len : 1)) == NULL)
          err(EXIT_FAILURE, "Can't allocate %jd bytes",
              (intmax_t)st.st_size);

      nr = read(fd, p, len);
      if (nr == -1)
          err(EXIT_FAILURE, "Can't read %jd bytes",
              (intmax_t)st.st_size);
      // A short read leaves errno untouched, so report it without strerror.
      if ((size_t)nr != len)
          errx(EXIT_FAILURE, "Short read: got %jd of %jd bytes",
              (intmax_t)nr, (intmax_t)st.st_size);

      printf("is csv %d\n", csv_parse(p, p + len));

      free(p);
      (void)close(fd);
      return 0;
  }
  #endif