123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- Subject: PR/61: tmc: Add UCS-32 built-in detection
- Origin: FILE5_35-56-gf0a26da7 <https://github.com/file/file/commit/FILE5_35-56-gf0a26da7>
- Upstream-Author: Christos Zoulas <christos@zoulas.com>
- Date: Tue Feb 19 20:30:35 2019 +0000
- Comment: Prerequisite for FILE5_36-1-gecca6e54
- --- a/src/encoding.c
- +++ b/src/encoding.c
- @@ -49,6 +49,7 @@
- size_t *);
- private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
- private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
- +private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *);
- private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
- private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
- private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
- @@ -116,6 +117,15 @@
- DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
- *code = "UTF-8 Unicode";
- *code_mime = "utf-8";
- + } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
- + if (ucs_type == 1) {
- + *code = "Little-endian UTF-32 Unicode";
- + *code_mime = "utf-32le";
- + } else {
- + *code = "Big-endian UTF-32 Unicode";
- + *code_mime = "utf-32be";
- + }
- + DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
- } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
- if (ucs_type == 1) {
- *code = "Little-endian UTF-16 Unicode";
- @@ -410,7 +420,7 @@
- }
-
- private int
- -looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
- +looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf,
- size_t *ulen)
- {
- int bigend;
- @@ -419,9 +429,9 @@
- if (nbytes < 2)
- return 0;
-
- - if (buf[0] == 0xff && buf[1] == 0xfe)
- + if (bf[0] == 0xff && bf[1] == 0xfe)
- bigend = 0;
- - else if (buf[0] == 0xfe && buf[1] == 0xff)
- + else if (bf[0] == 0xfe && bf[1] == 0xff)
- bigend = 1;
- else
- return 0;
- @@ -432,20 +442,58 @@
- /* XXX fix to properly handle chars > 65536 */
-
- if (bigend)
- - ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
- + ubf[(*ulen)++] = bf[i + 1] + 256 * bf[i];
- else
- - ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
- + ubf[(*ulen)++] = bf[i] + 256 * bf[i + 1];
-
- - if (ubuf[*ulen - 1] == 0xfffe)
- + if (ubf[*ulen - 1] == 0xfffe)
- return 0;
- - if (ubuf[*ulen - 1] < 128 &&
- - text_chars[(size_t)ubuf[*ulen - 1]] != T)
- + if (ubf[*ulen - 1] < 128 &&
- + text_chars[(size_t)ubf[*ulen - 1]] != T)
- return 0;
- }
-
- return 1 + bigend;
- }
-
- +/*
- + * Decide whether bf[0..nbytes) looks like BOM-prefixed UTF-32 text and,
- + * while scanning, store each decoded 32-bit unit into ubf and count
- + * them in *ulen (the BOM itself is not stored).
- + *
- + * Returns 0 if there are fewer than 4 bytes, no UTF-32 BOM, a decoded
- + * unit equals 0xfffe, or a unit below 128 is not marked T in text_chars.
- + * Otherwise returns 1 for a little-endian BOM (FF FE 00 00) and 2 for a
- + * big-endian BOM (00 00 FE FF).
- + *
- + * NOTE(review): ubf must be large enough for (nbytes / 4) entries; the
- + * capacity is not visible here -- confirm against the caller.
- + */
- +private int
- +looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf,
- + size_t *ulen)
- +{
- + int bigend;
- + size_t i;
- +
- + if (nbytes < 4)
- + return 0;
- +
- + if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
- + bigend = 0;
- + else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
- + bigend = 1;
- + else
- + return 0;
- +
- + *ulen = 0;
- +
- + /*
- + * Each iteration reads bf[i] .. bf[i + 3], so all four bytes must be
- + * inside the buffer; a trailing partial unit is ignored.
- + */
- + for (i = 4; i + 3 < nbytes; i += 4) {
- + /* XXX fix to properly handle chars > 65536 */
- +
- + /*
- + * Widen to unsigned long before shifting: an unsigned char is
- + * promoted to int, and shifting a value >= 0x80 left by 24
- + * would overflow a signed int.
- + */
- + if (bigend)
- + ubf[(*ulen)++] = (unsigned long)bf[i + 3]
- + | ((unsigned long)bf[i + 2] << 8)
- + | ((unsigned long)bf[i + 1] << 16)
- + | ((unsigned long)bf[i] << 24);
- + else
- + ubf[(*ulen)++] = (unsigned long)bf[i]
- + | ((unsigned long)bf[i + 1] << 8)
- + | ((unsigned long)bf[i + 2] << 16)
- + | ((unsigned long)bf[i + 3] << 24);
- +
- + if (ubf[*ulen - 1] == 0xfffe)
- + return 0;
- + if (ubf[*ulen - 1] < 128 &&
- + text_chars[(size_t)ubf[*ulen - 1]] != T)
- + return 0;
- + }
- +
- + return 1 + bigend;
- +}
- #undef F
- #undef T
- #undef I
|