is_json.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. /*-
  2. * Copyright (c) 2018 Christos Zoulas
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  15. * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  16. * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17. * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  18. * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  19. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  20. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  21. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  22. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  23. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  24. * POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. /*
  27. * Parse JSON object serialization format (RFC-7159)
  28. */
  29. #ifndef TEST
  30. #include "file.h"
  31. #ifndef lint
  32. FILE_RCSID("@(#)$File: is_json.c,v 1.30 2022/09/27 19:12:40 christos Exp $")
  33. #endif
  34. #include "magic.h"
  35. #else
  36. #include <stdio.h>
  37. #include <stddef.h>
  38. #endif
  39. #include <string.h>
  40. #ifdef DEBUG
  41. #include <stdio.h>
  42. #define DPRINTF(a, b, c) \
  43. printf("%*s%s [%.2x/%c] %.*s\n", (int)lvl, "", (a), *(b), *(b), \
  44. (int)(b - c), (const char *)(c))
  45. #define __file_debugused
  46. #else
  47. #define DPRINTF(a, b, c) do { } while (/*CONSTCOND*/0)
  48. #define __file_debugused __attribute__((__unused__))
  49. #endif
  50. #define JSON_ARRAY 0
  51. #define JSON_CONSTANT 1
  52. #define JSON_NUMBER 2
  53. #define JSON_OBJECT 3
  54. #define JSON_STRING 4
  55. #define JSON_ARRAYN 5
  56. #define JSON_MAX 6
/*
 * JSON_COUNT is a compile-time switch; it defaults to 0 when the build
 * does not define it.
 *
 * if JSON_COUNT != 0:
 *	- json_parse() returns success early for values nested deeper than
 *	  level 1 once an object or an array has been counted
 *	- file_is_json() appends the per-kind counters to the description
 * otherwise:
 *	neither of those pieces of code is compiled in
 *
 * NOTE(review): this comment used to say that the early stop ("stop if we
 * find an object or an array") belongs to the JSON_COUNT == 0 case and that
 * JSON_COUNT != 0 requires the whole data file; the code has the early
 * return under JSON_COUNT != 0 - confirm which one is intended.
 */
#ifndef JSON_COUNT
#define JSON_COUNT 0
#endif
/*
 * Forward declaration: the array and object parsers below call json_parse()
 * for their elements, and json_parse() in turn calls them.
 * Arguments, in order: in/out cursor, end of buffer, statistics array,
 * nesting level.
 */
static int json_parse(const unsigned char **, const unsigned char *, size_t *,
size_t);
  68. static int
  69. json_isspace(const unsigned char uc)
  70. {
  71. switch (uc) {
  72. case ' ':
  73. case '\n':
  74. case '\r':
  75. case '\t':
  76. return 1;
  77. default:
  78. return 0;
  79. }
  80. }
  81. static int
  82. json_isdigit(unsigned char uc)
  83. {
  84. switch (uc) {
  85. case '0': case '1': case '2': case '3': case '4':
  86. case '5': case '6': case '7': case '8': case '9':
  87. return 1;
  88. default:
  89. return 0;
  90. }
  91. }
  92. static int
  93. json_isxdigit(unsigned char uc)
  94. {
  95. if (json_isdigit(uc))
  96. return 1;
  97. switch (uc) {
  98. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  99. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  100. return 1;
  101. default:
  102. return 0;
  103. }
  104. }
  105. static const unsigned char *
  106. json_skip_space(const unsigned char *uc, const unsigned char *ue)
  107. {
  108. while (uc < ue && json_isspace(*uc))
  109. uc++;
  110. return uc;
  111. }
  112. /*ARGSUSED*/
  113. static int
  114. json_parse_string(const unsigned char **ucp, const unsigned char *ue,
  115. size_t lvl __file_debugused)
  116. {
  117. const unsigned char *uc = *ucp;
  118. size_t i;
  119. DPRINTF("Parse string: ", uc, *ucp);
  120. while (uc < ue) {
  121. switch (*uc++) {
  122. case '\0':
  123. goto out;
  124. case '\\':
  125. if (uc == ue)
  126. goto out;
  127. switch (*uc++) {
  128. case '\0':
  129. goto out;
  130. case '"':
  131. case '\\':
  132. case '/':
  133. case 'b':
  134. case 'f':
  135. case 'n':
  136. case 'r':
  137. case 't':
  138. continue;
  139. case 'u':
  140. if (ue - uc < 4) {
  141. uc = ue;
  142. goto out;
  143. }
  144. for (i = 0; i < 4; i++)
  145. if (!json_isxdigit(*uc++))
  146. goto out;
  147. continue;
  148. default:
  149. goto out;
  150. }
  151. case '"':
  152. DPRINTF("Good string: ", uc, *ucp);
  153. *ucp = uc;
  154. return 1;
  155. default:
  156. continue;
  157. }
  158. }
  159. out:
  160. DPRINTF("Bad string: ", uc, *ucp);
  161. *ucp = uc;
  162. return 0;
  163. }
  164. static int
  165. json_parse_array(const unsigned char **ucp, const unsigned char *ue,
  166. size_t *st, size_t lvl)
  167. {
  168. const unsigned char *uc = *ucp;
  169. DPRINTF("Parse array: ", uc, *ucp);
  170. while (uc < ue) {
  171. uc = json_skip_space(uc, ue);
  172. if (uc == ue)
  173. goto out;
  174. if (*uc == ']')
  175. goto done;
  176. if (!json_parse(&uc, ue, st, lvl + 1))
  177. goto out;
  178. if (uc == ue)
  179. goto out;
  180. switch (*uc) {
  181. case ',':
  182. uc++;
  183. continue;
  184. case ']':
  185. done:
  186. st[JSON_ARRAYN]++;
  187. DPRINTF("Good array: ", uc, *ucp);
  188. *ucp = uc + 1;
  189. return 1;
  190. default:
  191. goto out;
  192. }
  193. }
  194. out:
  195. DPRINTF("Bad array: ", uc, *ucp);
  196. *ucp = uc;
  197. return 0;
  198. }
  199. static int
  200. json_parse_object(const unsigned char **ucp, const unsigned char *ue,
  201. size_t *st, size_t lvl)
  202. {
  203. const unsigned char *uc = *ucp;
  204. DPRINTF("Parse object: ", uc, *ucp);
  205. while (uc < ue) {
  206. uc = json_skip_space(uc, ue);
  207. if (uc == ue)
  208. goto out;
  209. if (*uc == '}') {
  210. uc++;
  211. goto done;
  212. }
  213. if (*uc++ != '"') {
  214. DPRINTF("not string", uc, *ucp);
  215. goto out;
  216. }
  217. DPRINTF("next field", uc, *ucp);
  218. if (!json_parse_string(&uc, ue, lvl)) {
  219. DPRINTF("not string", uc, *ucp);
  220. goto out;
  221. }
  222. uc = json_skip_space(uc, ue);
  223. if (uc == ue)
  224. goto out;
  225. if (*uc++ != ':') {
  226. DPRINTF("not colon", uc, *ucp);
  227. goto out;
  228. }
  229. if (!json_parse(&uc, ue, st, lvl + 1)) {
  230. DPRINTF("not json", uc, *ucp);
  231. goto out;
  232. }
  233. if (uc == ue)
  234. goto out;
  235. switch (*uc++) {
  236. case ',':
  237. continue;
  238. case '}': /* { */
  239. done:
  240. DPRINTF("Good object: ", uc, *ucp);
  241. *ucp = uc;
  242. return 1;
  243. default:
  244. DPRINTF("not more", uc, *ucp);
  245. *ucp = uc - 1;
  246. goto out;
  247. }
  248. }
  249. out:
  250. DPRINTF("Bad object: ", uc, *ucp);
  251. *ucp = uc;
  252. return 0;
  253. }
  254. /*ARGSUSED*/
  255. static int
  256. json_parse_number(const unsigned char **ucp, const unsigned char *ue,
  257. size_t lvl __file_debugused)
  258. {
  259. const unsigned char *uc = *ucp;
  260. int got = 0;
  261. DPRINTF("Parse number: ", uc, *ucp);
  262. if (uc == ue)
  263. return 0;
  264. if (*uc == '-')
  265. uc++;
  266. for (; uc < ue; uc++) {
  267. if (!json_isdigit(*uc))
  268. break;
  269. got = 1;
  270. }
  271. if (uc == ue)
  272. goto out;
  273. if (*uc == '.')
  274. uc++;
  275. for (; uc < ue; uc++) {
  276. if (!json_isdigit(*uc))
  277. break;
  278. got = 1;
  279. }
  280. if (uc == ue)
  281. goto out;
  282. if (got && (*uc == 'e' || *uc == 'E')) {
  283. uc++;
  284. got = 0;
  285. if (uc == ue)
  286. goto out;
  287. if (*uc == '+' || *uc == '-')
  288. uc++;
  289. for (; uc < ue; uc++) {
  290. if (!json_isdigit(*uc))
  291. break;
  292. got = 1;
  293. }
  294. }
  295. out:
  296. if (!got)
  297. DPRINTF("Bad number: ", uc, *ucp);
  298. else
  299. DPRINTF("Good number: ", uc, *ucp);
  300. *ucp = uc;
  301. return got;
  302. }
  303. /*ARGSUSED*/
  304. static int
  305. json_parse_const(const unsigned char **ucp, const unsigned char *ue,
  306. const char *str, size_t len, size_t lvl __file_debugused)
  307. {
  308. const unsigned char *uc = *ucp;
  309. DPRINTF("Parse const: ", uc, *ucp);
  310. *ucp += --len - 1;
  311. if (*ucp > ue)
  312. *ucp = ue;
  313. for (; uc < ue && --len;) {
  314. if (*uc++ != *++str) {
  315. DPRINTF("Bad const: ", uc, *ucp);
  316. return 0;
  317. }
  318. }
  319. DPRINTF("Good const: ", uc, *ucp);
  320. return 1;
  321. }
  322. static int
  323. json_parse(const unsigned char **ucp, const unsigned char *ue,
  324. size_t *st, size_t lvl)
  325. {
  326. const unsigned char *uc, *ouc;
  327. int rv = 0;
  328. int t;
  329. ouc = uc = json_skip_space(*ucp, ue);
  330. if (uc == ue)
  331. goto out;
  332. // Avoid recursion
  333. if (lvl > 500) {
  334. DPRINTF("Too many levels", uc, *ucp);
  335. return 0;
  336. }
  337. #if JSON_COUNT
  338. /* bail quickly if not counting */
  339. if (lvl > 1 && (st[JSON_OBJECT] || st[JSON_ARRAYN]))
  340. return 1;
  341. #endif
  342. DPRINTF("Parse general: ", uc, *ucp);
  343. switch (*uc++) {
  344. case '"':
  345. rv = json_parse_string(&uc, ue, lvl + 1);
  346. t = JSON_STRING;
  347. break;
  348. case '[':
  349. rv = json_parse_array(&uc, ue, st, lvl + 1);
  350. t = JSON_ARRAY;
  351. break;
  352. case '{': /* '}' */
  353. rv = json_parse_object(&uc, ue, st, lvl + 1);
  354. t = JSON_OBJECT;
  355. break;
  356. case 't':
  357. rv = json_parse_const(&uc, ue, "true", sizeof("true"), lvl + 1);
  358. t = JSON_CONSTANT;
  359. break;
  360. case 'f':
  361. rv = json_parse_const(&uc, ue, "false", sizeof("false"),
  362. lvl + 1);
  363. t = JSON_CONSTANT;
  364. break;
  365. case 'n':
  366. rv = json_parse_const(&uc, ue, "null", sizeof("null"), lvl + 1);
  367. t = JSON_CONSTANT;
  368. break;
  369. default:
  370. --uc;
  371. rv = json_parse_number(&uc, ue, lvl + 1);
  372. t = JSON_NUMBER;
  373. break;
  374. }
  375. if (rv)
  376. st[t]++;
  377. uc = json_skip_space(uc, ue);
  378. out:
  379. DPRINTF("End general: ", uc, *ucp);
  380. *ucp = uc;
  381. if (lvl == 0) {
  382. if (!rv)
  383. return 0;
  384. if (uc == ue)
  385. return (st[JSON_ARRAYN] || st[JSON_OBJECT]) ? 1 : 0;
  386. if (*ouc == *uc && json_parse(&uc, ue, st, 1))
  387. return (st[JSON_ARRAYN] || st[JSON_OBJECT]) ? 2 : 0;
  388. else
  389. return 0;
  390. }
  391. return rv;
  392. }
  393. #ifndef TEST
  394. int
  395. file_is_json(struct magic_set *ms, const struct buffer *b)
  396. {
  397. const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
  398. const unsigned char *ue = uc + b->flen;
  399. size_t st[JSON_MAX];
  400. int mime = ms->flags & MAGIC_MIME;
  401. int jt;
  402. if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
  403. return 0;
  404. memset(st, 0, sizeof(st));
  405. if ((jt = json_parse(&uc, ue, st, 0)) == 0)
  406. return 0;
  407. if (mime == MAGIC_MIME_ENCODING)
  408. return 1;
  409. if (mime) {
  410. if (file_printf(ms, "application/%s",
  411. jt == 1 ? "json" : "x-ndjson") == -1)
  412. return -1;
  413. return 1;
  414. }
  415. if (file_printf(ms, "%sJSON text data",
  416. jt == 1 ? "" : "New Line Delimited ") == -1)
  417. return -1;
  418. #if JSON_COUNT
  419. #define P(n) st[n], st[n] > 1 ? "s" : ""
  420. if (file_printf(ms, " (%" SIZE_T_FORMAT "u object%s, %" SIZE_T_FORMAT
  421. "u array%s, %" SIZE_T_FORMAT "u string%s, %" SIZE_T_FORMAT
  422. "u constant%s, %" SIZE_T_FORMAT "u number%s, %" SIZE_T_FORMAT
  423. "u >1array%s)",
  424. P(JSON_OBJECT), P(JSON_ARRAY), P(JSON_STRING), P(JSON_CONSTANT),
  425. P(JSON_NUMBER), P(JSON_ARRAYN))
  426. == -1)
  427. return -1;
  428. #endif
  429. return 1;
  430. }
  431. #else
  432. #include <sys/types.h>
  433. #include <sys/stat.h>
  434. #include <stdio.h>
  435. #include <fcntl.h>
  436. #include <unistd.h>
  437. #include <stdlib.h>
  438. #include <stdint.h>
  439. #include <err.h>
  440. int
  441. main(int argc, char *argv[])
  442. {
  443. int fd;
  444. struct stat st;
  445. unsigned char *p;
  446. size_t stats[JSON_MAX];
  447. if ((fd = open(argv[1], O_RDONLY)) == -1)
  448. err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
  449. if (fstat(fd, &st) == -1)
  450. err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
  451. if ((p = CAST(char *, malloc(st.st_size))) == NULL)
  452. err(EXIT_FAILURE, "Can't allocate %jd bytes",
  453. (intmax_t)st.st_size);
  454. if (read(fd, p, st.st_size) != st.st_size)
  455. err(EXIT_FAILURE, "Can't read %jd bytes",
  456. (intmax_t)st.st_size);
  457. memset(stats, 0, sizeof(stats));
  458. printf("is json %d\n", json_parse((const unsigned char **)&p,
  459. p + st.st_size, stats, 0));
  460. return 0;
  461. }
  462. #endif