cherry-pick.FILE5_35-56-gf0a26da7.pr-61-tmc-add-ucs-32-built-in-detection.patch 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. Subject: PR/61: tmc: Add UCS-32 built-in detection
  2. Origin: FILE5_35-56-gf0a26da7 <https://github.com/file/file/commit/FILE5_35-56-gf0a26da7>
  3. Upstream-Author: Christos Zoulas <christos@zoulas.com>
  4. Date: Tue Feb 19 20:30:35 2019 +0000
  5. Comment: Prerequisite for FILE5_36-1-gecca6e54
  6. --- a/src/encoding.c
  7. +++ b/src/encoding.c
  8. @@ -49,6 +49,7 @@
  9. size_t *);
  10. private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
  11. private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
  12. +private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *);
  13. private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
  14. private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
  15. private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
  16. @@ -116,6 +117,15 @@
  17. DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
  18. *code = "UTF-8 Unicode";
  19. *code_mime = "utf-8";
  20. + } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
  21. + if (ucs_type == 1) {
  22. + *code = "Little-endian UTF-32 Unicode";
  23. + *code_mime = "utf-32le";
  24. + } else {
  25. + *code = "Big-endian UTF-32 Unicode";
  26. + *code_mime = "utf-32be";
  27. + }
  28. + DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
  29. } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
  30. if (ucs_type == 1) {
  31. *code = "Little-endian UTF-16 Unicode";
  32. @@ -410,7 +420,7 @@
  33. }
  34. private int
  35. -looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
  36. +looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf,
  37. size_t *ulen)
  38. {
  39. int bigend;
  40. @@ -419,9 +429,9 @@
  41. if (nbytes < 2)
  42. return 0;
  43. - if (buf[0] == 0xff && buf[1] == 0xfe)
  44. + if (bf[0] == 0xff && bf[1] == 0xfe)
  45. bigend = 0;
  46. - else if (buf[0] == 0xfe && buf[1] == 0xff)
  47. + else if (bf[0] == 0xfe && bf[1] == 0xff)
  48. bigend = 1;
  49. else
  50. return 0;
  51. @@ -432,20 +442,58 @@
  52. /* XXX fix to properly handle chars > 65536 */
  53. if (bigend)
  54. - ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
  55. + ubf[(*ulen)++] = bf[i + 1] + 256 * bf[i];
  56. else
  57. - ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
  58. + ubf[(*ulen)++] = bf[i] + 256 * bf[i + 1];
  59. - if (ubuf[*ulen - 1] == 0xfffe)
  60. + if (ubf[*ulen - 1] == 0xfffe)
  61. return 0;
  62. - if (ubuf[*ulen - 1] < 128 &&
  63. - text_chars[(size_t)ubuf[*ulen - 1]] != T)
  64. + if (ubf[*ulen - 1] < 128 &&
  65. + text_chars[(size_t)ubf[*ulen - 1]] != T)
  66. return 0;
  67. }
  68. return 1 + bigend;
  69. }
  70. +private int
  71. +looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf,
  72. + size_t *ulen)
  73. +{
  74. + int bigend;
  75. + size_t i;
  76. + /* Returns 0 if bf has no UTF-32 BOM or a decoded unit is rejected, else 1 (LE) or 2 (BE). */
  77. + if (nbytes < 4)
  78. + return 0;
  79. + /* BOM: FF FE 00 00 selects little-endian, 00 00 FE FF selects big-endian. */
  80. + if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
  81. + bigend = 0;
  82. + else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
  83. + bigend = 1;
  84. + else
  85. + return 0;
  86. +
  87. + *ulen = 0;
  88. + /* Decode from just past the BOM; *ulen counts the units stored in ubf. */
  89. + for (i = 4; i + 3 < nbytes; i += 4) {
  90. + /* Require all four bytes of a unit in range; a short tail is skipped. */
  91. + /* Widen the top byte before << 24: shifting a promoted int can overflow. */
  92. + if (bigend)
  93. + ubf[(*ulen)++] = bf[i + 3] | (bf[i + 2] << 8)
  94. + | (bf[i + 1] << 16) | ((unsigned long)bf[i] << 24);
  95. + else
  96. + ubf[(*ulen)++] = bf[i] | (bf[i + 1] << 8)
  97. + | (bf[i + 2] << 16) | ((unsigned long)bf[i + 3] << 24);
  98. +
  99. + if (ubf[*ulen - 1] == 0xfffe)
  100. + return 0;
  101. + if (ubf[*ulen - 1] < 128 &&
  102. + text_chars[(size_t)ubf[*ulen - 1]] != T)
  103. + return 0;
  104. + }
  105. +
  106. + return 1 + bigend;
  107. +}
  108. #undef F
  109. #undef T
  110. #undef I