cbiedl
/
file


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
							Subject: PR/61: tmc: Add UCS-32 built-in detection
Origin: FILE5_35-56-gf0a26da7 <https://github.com/file/file/commit/FILE5_35-56-gf0a26da7>
Upstream-Author: Christos Zoulas <christos@zoulas.com>
Date: Tue Feb 19 20:30:35 2019 +0000
Comment: Prerequisite for FILE5_36-1-gecca6e54

--- a/src/encoding.c
+++ b/src/encoding.c
@@ -49,6 +49,7 @@
     size_t *);
 private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
 private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
@@ -116,6 +117,15 @@
 		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
 		*code = "UTF-8 Unicode";
 		*code_mime = "utf-8";
+	} else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
+		if (ucs_type == 1) {
+			*code = "Little-endian UTF-32 Unicode";
+			*code_mime = "utf-32le";
+		} else {
+			*code = "Big-endian UTF-32 Unicode";
+			*code_mime = "utf-32be";
+		}
+		DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
 	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
 		if (ucs_type == 1) {
 			*code = "Little-endian UTF-16 Unicode";
@@ -410,7 +420,7 @@
 }
 
 private int
-looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf,
     size_t *ulen)
 {
 	int bigend;
@@ -419,9 +429,9 @@
 	if (nbytes < 2)
 		return 0;
 
-	if (buf[0] == 0xff && buf[1] == 0xfe)
+	if (bf[0] == 0xff && bf[1] == 0xfe)
 		bigend = 0;
-	else if (buf[0] == 0xfe && buf[1] == 0xff)
+	else if (bf[0] == 0xfe && bf[1] == 0xff)
 		bigend = 1;
 	else
 		return 0;
@@ -432,20 +442,58 @@
 		/* XXX fix to properly handle chars > 65536 */
 
 		if (bigend)
-			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
+			ubf[(*ulen)++] = bf[i + 1] + 256 * bf[i];
 		else
-			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
+			ubf[(*ulen)++] = bf[i] + 256 * bf[i + 1];
 
-		if (ubuf[*ulen - 1] == 0xfffe)
+		if (ubf[*ulen - 1] == 0xfffe)
 			return 0;
-		if (ubuf[*ulen - 1] < 128 &&
-		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
+		if (ubf[*ulen - 1] < 128 &&
+		    text_chars[(size_t)ubf[*ulen - 1]] != T)
 			return 0;
 	}
 
 	return 1 + bigend;
 }
 
+private int
+looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf,
+    size_t *ulen)
+{
+	int bigend;
+	size_t i;
+
+	if (nbytes < 4)
+		return 0;
+
+	if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
+		bigend = 0;
+	else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
+		bigend = 1;
+	else
+		return 0;
+
+	*ulen = 0;
+
+	for (i = 4; i + 1 < nbytes; i += 4) {
+		/* XXX fix to properly handle chars > 65536 */
+
+		if (bigend)
+			ubf[(*ulen)++] = bf[i + 3] | (bf[i + 2] << 8)
+			    | (bf[i + 1] << 16) | bf[i] << 24;
+		else
+			ubf[(*ulen)++] = bf[i] | (bf[i + 1] << 8) 
+			    | (bf[i + 2] << 16) | (bf[i + 3] << 24);
+
+		if (ubf[*ulen - 1] == 0xfffe)
+			return 0;
+		if (ubf[*ulen - 1] < 128 &&
+		    text_chars[(size_t)ubf[*ulen - 1]] != T)
+			return 0;
+	}
+
+	return 1 + bigend;
+}
 #undef F
 #undef T
 #undef I