(svn r9354) [0.5] -Backport from trunk (r8975, r9003, r9011, r9012): 0.5
authorglx
Mon, 19 Mar 2007 21:42:05 +0000
branch0.5
changeset 5470 9fce095970bb
parent 5469 e0b2d7d37916
child 5471 d0fc5d926679
(svn r9354) [0.5] -Backport from trunk (r8975, r9003, r9011, r9012):
-Regression: [win32] Possible buffer overflow if unicode text is pasted into an input box and needs trimming. (r8975)
-Codechange: Introduce a function Utf8PrevCharLen that finds the starting character of an UTF-8 sequence from a given position and returns the length to the first UTF-8 encoding byte of that sequence. (r9003)
-Codechange: Rework Utf8PrevChar so that it returns a pointer to the previous UTF8 character's first byte instead of a byte-length offset (r9011)
-Fix: When cutting strings into multiple lines, also take into consideration whitespace characters of more than 1 byte in length (e.g. IDEOGRAPHIC SPACE; see the IsWhitespace() function). When trimming such strings, account for multi-byte sequences by using *Utf8PrevChar(v) = '\0'. (r9012)
-Codechange: Add a function Utf8TrimString() that properly trims a string at an UTF-8 character boundary instead of at an arbitrary byte (and use it in the chat area) (r9012)
gfx.c
misc_gui.c
string.c
string.h
texteff.c
win32.c
--- a/gfx.c	Mon Mar 19 21:04:06 2007 +0000
+++ b/gfx.c	Mon Mar 19 21:42:05 2007 +0000
@@ -270,7 +270,7 @@
 			if (w >= maxw) {
 				// string got too big... insert dotdotdot
 				ddd_pos[0] = ddd_pos[1] = ddd_pos[2] = '.';
-				ddd_pos[3] = 0;
+				ddd_pos[3] = '\0';
 				return ddd_w;
 			}
 		} else {
@@ -410,11 +410,12 @@
 	for (;;) {
 		char *last_space = NULL;
 		int w = 0;
+		char *s;
 
 		for (;;) {
 			WChar c = Utf8Consume((const char **)&str);
 			/* whitespace is where we will insert the line-break */
-			if (c == ' ') last_space = str;
+			if (IsWhitespace(c)) last_space = str;
 
 			if (IsPrintable(c)) {
 				w += GetCharacterWidth(size, c);
@@ -425,7 +426,7 @@
 				 * 2. In all other cases force a linebreak at the last seen whitespace */
 				if (w > maxw) {
 					if (last_space == NULL) {
-						str[-1] = '\0';
+						*Utf8PrevChar(str) = '\0';
 						return num + (size << 16);
 					}
 					str = last_space;
@@ -443,9 +444,17 @@
 			}
 		}
 end_of_inner_loop:
-		/* string didn't fit on line, so 'dummy' terminate and increase linecount */
+		/* String didn't fit on line (or a '\n' was encountered), so 'dummy' terminate
+		 * and increase linecount. We use Utf8PrevChar() because whitespace separators
+		 * longer than 1 byte are also supported */
 		num++;
-		str[-1] = '\0';
+		s = Utf8PrevChar(str);
+		*s++ = '\0';
+
+		/* If the replaced separator was longer than 1 byte, shift the remainder to the left to close the gap */
+		if (str - s >= 1) {
+			for (; str[-1] != '\0';) *s++ = *str++;
+		}
 	}
 }
 
--- a/misc_gui.c	Mon Mar 19 21:04:06 2007 +0000
+++ b/misc_gui.c	Mon Mar 19 21:42:05 2007 +0000
@@ -794,21 +794,21 @@
 	WChar c;
 	uint width;
 	size_t len;
+	char *s = tb->buf + tb->caretpos;
 
-	if (backspace) {
-		do {
-			tb->caretpos--;
-		} while (IsUtf8Part(*(tb->buf + tb->caretpos)));
-	}
+	if (backspace) s = Utf8PrevChar(s);
 
-	len = Utf8Decode(&c, tb->buf + tb->caretpos);
+	len = Utf8Decode(&c, s);
 	width = GetCharacterWidth(FS_NORMAL, c);
 
 	tb->width  -= width;
-	if (backspace) tb->caretxoffs -= width;
+	if (backspace) {
+		tb->caretpos   -= len;
+		tb->caretxoffs -= width;
+	}
 
 	/* Move the remaining characters over the marker */
-	memmove(tb->buf + tb->caretpos, tb->buf + tb->caretpos + len, tb->length - tb->caretpos - len + 1);
+	memmove(s, s + len, tb->length - (s - tb->buf) - len + 1);
 	tb->length -= len;
 }
 
@@ -881,12 +881,9 @@
 	case WKC_LEFT:
 		if (tb->caretpos != 0) {
 			WChar c;
-
-			do {
-				tb->caretpos--;
-			} while (IsUtf8Part(*(tb->buf + tb->caretpos)));
-
-			Utf8Decode(&c, tb->buf + tb->caretpos);
+			const char *s = Utf8PrevChar(tb->buf + tb->caretpos);
+			Utf8Decode(&c, s);
+			tb->caretpos    = s - tb->buf; // -= (tb->buf + tb->caretpos - s)
 			tb->caretxoffs -= GetCharacterWidth(FS_NORMAL, c);
 
 			return true;
--- a/string.c	Mon Mar 19 21:04:06 2007 +0000
+++ b/string.c	Mon Mar 19 21:42:05 2007 +0000
@@ -269,3 +269,29 @@
 	*buf = '?';
 	return 1;
 }
+
+/**
+ * Truncate an UTF-8 string in place so that it ends on a character boundary.
+ * @param s NUL-terminated string to trim; it is modified in place
+ * @param maxlen maximum size in bytes, _INCLUDING_ the terminating '\0'
+ * @return the new length in bytes of the string (eg. strlen(new_string))
+ * @note trimming also stops at an invalid lead byte or an incomplete final sequence
+ */
+size_t Utf8TrimString(char *s, size_t maxlen)
+{
+	size_t length = 0;
+	const char *ptr = strchr(s, '\0'); // end of the string as it was passed in
+	while (*s != '\0') {
+		size_t len = Utf8EncodedCharLen(*s);
+		if (len == 0) break; // not a valid UTF-8 lead byte; cut the string here
+
+		/* Stop if this character would not fit within maxlen (terminator
+		 * included) or if its sequence would run past the end of the string */
+		if (length + len >= maxlen || (s + len > ptr)) break;
+		s += len;
+		length += len;
+	}
+
+	*s = '\0'; // terminate right after the last complete character that was kept
+	return length;
+}
--- a/string.h	Mon Mar 19 21:04:06 2007 +0000
+++ b/string.h	Mon Mar 19 21:42:05 2007 +0000
@@ -71,6 +71,7 @@
 
 size_t Utf8Decode(WChar *c, const char *s);
 size_t Utf8Encode(char *buf, WChar c);
+size_t Utf8TrimString(char *s, size_t maxlen);
 
 
 static inline WChar Utf8Consume(const char **s)
@@ -97,12 +98,43 @@
 }
 
 
+/**
+ * Return the byte length of an UTF-8 sequence, judged from its first byte.
+ * @param c the first (lead) byte of the encoded character
+ * @return 1 to 4 for a recognised lead byte; 0 if 'c' is not a valid first byte
+ */
+static inline size_t Utf8EncodedCharLen(char c)
+{
+	if (GB(c, 3, 5) == 0x1E) return 4; // presumably 11110xxx; GB() is defined outside this patch
+	if (GB(c, 4, 4) == 0x0E) return 3; // presumably 1110xxxx
+	if (GB(c, 5, 3) == 0x06) return 2; // presumably 110xxxxx
+	if (GB(c, 7, 1) == 0x00) return 1; // presumably 0xxxxxxx (plain ASCII)
+
+	/* Invalid UTF8 start encoding */
+	return 0;
+}
+
+
 /* Check if the given character is part of a UTF8 sequence */
 static inline bool IsUtf8Part(char c)
 {
 	return GB(c, 6, 2) == 2;
 }
 
+/**
+ * Retrieve the previous UNICODE character in an UTF-8 encoded string.
+ * @param s points to the first byte of the next character; must lie past the buffer start
+ * @return a pointer into 's' to the previous UNICODE character's first byte
+ * @note Do not use this to determine the length of the previous encoded char,
+ * because the byte found might be an invalid/corrupt start-sequence
+ */
+static inline char *Utf8PrevChar(const char *s)
+{
+	const char *ret = s;
+	while (IsUtf8Part(*--ret)); // step back over continuation bytes; no lower bound is checked
+	return (char*)ret; // const is cast away so that callers can write through the result
+}
+
 
 static inline bool IsPrintable(WChar c)
 {
@@ -112,5 +144,20 @@
 	return true;
 }
 
+/**
+ * Check whether UNICODE character is one of the whitespace characters recognised here
+ * @param c UNICODE character to check
+ * @return true only for SPACE, NO-BREAK SPACE and IDEOGRAPHIC SPACE; false otherwise
+ * @see http://www.fileformat.info/info/unicode/category/Zs/list.htm
+ */
+static inline bool IsWhitespace(WChar c)
+{
+	return
+	  c == 0x0020 /* SPACE */ ||
+	  c == 0x00A0 /* NO-BREAK SPACE */ ||
+	  c == 0x3000 /* IDEOGRAPHIC SPACE */
+	;
+}
+
 
 #endif /* STRING_H */
--- a/texteff.c	Mon Mar 19 21:04:06 2007 +0000
+++ b/texteff.c	Mon Mar 19 21:42:05 2007 +0000
@@ -27,7 +27,7 @@
 	uint32 params_2;
 } TextEffect;
 
-#define MAX_TEXTMESSAGE_LENGTH 150
+#define MAX_TEXTMESSAGE_LENGTH 200
 
 typedef struct TextMessage {
 	char message[MAX_TEXTMESSAGE_LENGTH];
@@ -77,6 +77,9 @@
 	vsnprintf(buf, lengthof(buf), message, va);
 	va_end(va);
 
+
+	Utf8TrimString(buf, MAX_TEXTMESSAGE_LENGTH);
+
 	/* Force linebreaks for strings that are too long */
 	lines = GB(FormatStringLinebreaks(buf, _textmsg_box.width - 8), 0, 16) + 1;
 	if (lines >= MAX_CHAT_MESSAGES) return;
--- a/win32.c	Mon Mar 19 21:04:06 2007 +0000
+++ b/win32.c	Mon Mar 19 21:42:05 2007 +0000
@@ -995,16 +995,18 @@
 	width = length = 0;
 
 	for (ptr = utf8_buf; (c = Utf8Consume(&ptr)) != '\0';) {
+		size_t len;
 		byte charwidth;
-
 		if (!IsPrintable(c)) break;
-		if (tb->length + length >= tb->maxlength - 1) break;
+
+		len = Utf8CharLen(c);
+		if (tb->length + length >= tb->maxlength - (uint16)len) break;
+
 		charwidth = GetCharacterWidth(FS_NORMAL, c);
-
 		if (tb->maxwidth != 0 && width + tb->width + charwidth > tb->maxwidth) break;
 
 		width += charwidth;
-		length += Utf8CharLen(c);
+		length += len;
 	}
 
 	if (length == 0) return false;
@@ -1016,6 +1018,7 @@
 
 	tb->length += length;
 	tb->caretpos += length;
+	assert(tb->length < tb->maxlength);
 	tb->buf[tb->length] = '\0'; // terminating zero
 
 	return true;