(svn r7185) -Codechange: Make strgen validate strings for UTF-8 well-formed-ness-ness
authorpeter1138
Fri, 17 Nov 2006 07:46:02 +0000
changeset 5111 05bb05c1e9b5
parent 5110 880a547b5717
child 5112 9bb62a7fc166
(svn r7185) -Codechange: Make strgen validate strings for UTF-8 well-formed-ness-ness
strgen/strgen.c
--- a/strgen/strgen.c	Fri Nov 17 07:35:12 2006 +0000
+++ b/strgen/strgen.c	Fri Nov 17 07:46:02 2006 +0000
@@ -222,6 +222,31 @@
 }
 
 
+/** Validate the UTF-8 sequence that starts at s (well-formedness as in RFC 3629).
+ * @param s Pointer to the first byte of the sequence to check.
+ * @return Length in bytes (1..4) of the sequence, or 0 if it is not well-formed. */
+size_t Utf8Validate(const char *s)
+{
+	uint32 c;
+	if (!HASBIT(s[0], 7)) {
+		return 1; /* 1 byte: bit 7 clear, i.e. plain ASCII */
+	} else if (GB(s[0], 5, 3) == 6 && IsUtf8Part(s[1])) {
+		c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
+		if (c >= 0x80) return 2; /* 2 bytes; smaller values are overlong forms */
+	} else if (GB(s[0], 4, 4) == 14 && IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
+		c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
+		/* 3 bytes; reject overlong forms and UTF-16 surrogates U+D800..U+DFFF */
+		if (c >= 0x800 && (c < 0xD800 || c > 0xDFFF)) return 3;
+	} else if (GB(s[0], 3, 5) == 30 && IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
+		c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
+		/* 4 bytes; reject overlong forms and values beyond U+10FFFF */
+		if (c >= 0x10000 && c <= 0x10FFFF) return 4;
+	}
+
+	return 0; /* bad lead byte, missing continuation byte, or value out of range */
+}
+
+
 static void EmitSingleChar(char *buf, int value)
 {
 	if (*buf != '\0') warning("Ignoring trailing letters in command");
@@ -781,6 +806,16 @@
 	*t = 0;
 	s++;
 
+	/* Check string is valid UTF-8 */
+	{
+		const char *tmp;
+		for (tmp = s; *tmp != '\0';) {
+			size_t len = Utf8Validate(tmp);
+			if (len == 0) fatal("Invalid UTF-8 sequence in '%s'", s);
+			tmp += len;
+		}
+	}
+
 	// Check if the string has a case..
 	// The syntax for cases is IDENTNAME.case
 	casep = strchr(str, '.');