Fix bug where broken UTF-8 codes in tmux history threw an error. Now we just...

Fix bug where broken UTF-8 codes in tmux history threw an error. Now we just replace them with combining marks

Fix bug where broken UTF-8 codes in tmux history threw an error. Now we just...
b3d4acf9 · George Nachman · 8817f3fa · b3d4acf9 · b3d4acf9 · b3d4acf9
Commit b3d4acf9 authored 12 years ago by George Nachman
--- a/NSStringITerm.h
+++ b/NSStringITerm.h
@@ -34,6 +34,22 @@
  
 #import <Foundation/Foundation.h>
  
+// This is the standard unicode replacement character for when input couldn't
+// be parsed properly but we need to render something there.
+#define UNICODE_REPLACEMENT_CHAR 0xfffd
+
+// Examine the leading UTF-8 sequence in a char array and check that it
+// is properly encoded. Computes the number of bytes to use for the
+// first code point. Returns the first code point, if it exists, in *result.
+//
+// Return value:
+// positive: This many bytes compose a legal Unicode character.
+// negative: abs(this many) bytes are illegal, should be replaced by one
+//   single replacement symbol.
+// zero: Unfinished sequence, input needs to grow.
+int decode_utf8_char(const unsigned char * restrict datap,
+                     int datalen,
+                     int * restrict result);
  
 @interface NSString (iTerm)
  
@@ -59,4 +75,8 @@
 // Convert a string of hex values (an even number of [0-9A-Fa-f]) into data.
 - (NSData *)dataFromHexValues;
  
+// Always returns a non-null value, but it may contain replacement chars for
+// malformed utf-8 sequences.
+- (NSString *)initWithUTF8DataIgnoringErrors:(NSData *)data;
+
 @end
--- a/NSStringITerm.m
+++ b/NSStringITerm.m
@@ -457,4 +457,98 @@ static int fromhex(unichar c) {
               stringByReplacingOccurrencesOfString:@"\"" withString:@"\\\""];
 }
  
+// Inspects the UTF-8 sequence that starts at datap[0] and reports how many
+// bytes belong to it.
+//
+// Returns:
+//   > 0  that many bytes form one code point, which is stored in *result.
+//   < 0  abs(value) bytes are invalid and should be shown as a single
+//        replacement character; *result is left untouched.
+//   0    the sequence is incomplete (or datalen is 0); more input is needed
+//        and *result is left untouched.
+int decode_utf8_char(const unsigned char *datap,
+                     int datalen,
+                     int * restrict result)
+{
+    // Lowest code point that may legitimately be encoded with a sequence of
+    // the given length (indexes 0 and 1 are placeholders). A decoded value
+    // below this bound means a longer-than-necessary encoding was used.
+    static const unsigned int kMinimumForLength[7] = {
+        0, 0, 0x80UL, 0x800UL, 0x10000UL, 0x200000UL, 0x4000000UL
+    };
+
+    if (datalen == 0) {
+        return 0;
+    }
+
+    const unsigned char lead = datap[0];
+    if (lead < 0x80) {
+        // 0xxxxxxx: plain ASCII, a complete one-byte sequence.
+        *result = lead;
+        return 1;
+    }
+
+    // Classify the lead byte by the range it falls in; the payload mask
+    // keeps only the bits to the right of the length prefix.
+    int sequenceLength;
+    unsigned int payloadMask;
+    if (lead < 0xC0) {
+        // 10xxxxxx: a continuation byte cannot start a sequence.
+        return -1;
+    } else if (lead < 0xE0) {
+        sequenceLength = 2;  // 110xxxxx
+        payloadMask = 0x1F;
+    } else if (lead < 0xF0) {
+        sequenceLength = 3;  // 1110xxxx
+        payloadMask = 0x0F;
+    } else if (lead < 0xF8) {
+        sequenceLength = 4;  // 11110xxx
+        payloadMask = 0x07;
+    } else if (lead < 0xFC) {
+        sequenceLength = 5;  // 111110xx
+        payloadMask = 0x03;
+    } else if (lead < 0xFE) {
+        sequenceLength = 6;  // 1111110x
+        payloadMask = 0x01;
+    } else {
+        // 0xFE and 0xFF are never valid lead bytes.
+        return -1;
+    }
+
+    unsigned int codePoint = lead & payloadMask;
+    int index = 1;
+    while (index < sequenceLength) {
+        if (index >= datalen) {
+            // Ran out of input in the middle of the sequence.
+            return 0;
+        }
+        const unsigned char next = datap[index];
+        if ((next >> 6) != 0x2) {
+            // Not a 10xxxxxx continuation byte: the bytes seen so far are bad.
+            return -index;
+        }
+        codePoint = (codePoint << 6) | (next & 0x3F);
+        index++;
+    }
+
+    if (codePoint < kMinimumForLength[sequenceLength]) {
+        // Overlong encoding: reject the whole sequence.
+        return -sequenceLength;
+    }
+
+    *result = (int)codePoint;
+    return sequenceLength;
+}
+
+// Builds a string from |data|, which is expected to hold UTF-8 but may be
+// damaged. Nothing is rejected: every span that cannot be turned into a valid
+// Unicode scalar value becomes one UNICODE_REPLACEMENT_CHAR. That covers
+//   - malformed or overlong sequences reported by decode_utf8_char,
+//   - a multi-byte sequence cut short by the end of |data|,
+//   - encoded surrogates (U+D800..U+DFFF) and values above U+10FFFF, which
+//     decode_utf8_char accepts but which have no valid UTF-16 form.
+// A nil or empty |data| yields an empty string.
+- (NSString *)initWithUTF8DataIgnoringErrors:(NSData *)data {
+    // decode_utf8_char never examines more than six bytes of a sequence.
+    static const NSUInteger kLongestSequence = 6;
+    const unsigned char *p = data.bytes;
+    NSUInteger remaining = data.length;
+    NSMutableData *utf16Data = [NSMutableData data];
+
+    while (remaining > 0) {
+        // Hand the decoder a bounded window so the length always fits in an
+        // int, even when |data| is larger than INT_MAX bytes.
+        const int window =
+            (int)(remaining < kLongestSequence ? remaining : kLongestSequence);
+        int theChar = 0;
+        int consumed = decode_utf8_char(p, window, &theChar);
+
+        if (consumed == 0) {
+            // The sequence needs more bytes than the window holds. The window
+            // is only shorter than a full sequence at the very end of |data|,
+            // so this is a truncated tail: replace it rather than silently
+            // dropping the bytes.
+            theChar = UNICODE_REPLACEMENT_CHAR;
+            consumed = window;
+        } else if (consumed < 0) {
+            // Malformed or overlong: abs(consumed) bytes collapse into one
+            // replacement character.
+            theChar = UNICODE_REPLACEMENT_CHAR;
+            consumed = -consumed;
+        } else if (theChar > 0x10FFFF ||
+                   (theChar >= 0xD800 && theChar <= 0xDFFF)) {
+            // Structurally valid UTF-8 that does not encode a Unicode scalar
+            // value. Emitting it would put a lone or out-of-range surrogate
+            // into the UTF-16 buffer.
+            theChar = UNICODE_REPLACEMENT_CHAR;
+        }
+
+        if (theChar > 0xFFFF) {
+            // Supplementary plane: split into a high/low surrogate pair.
+            const int offset = theChar - 0x10000;
+            UniChar high = (UniChar)(0xD800 + (offset >> 10));
+            [utf16Data appendBytes:&high length:sizeof(high)];
+            theChar = 0xDC00 + (offset & 0x3FF);
+        }
+
+        UniChar c = (UniChar)theChar;
+        [utf16Data appendBytes:&c length:sizeof(c)];
+
+        p += consumed;
+        remaining -= (NSUInteger)consumed;
+    }
+
+    return [self initWithData:utf16Data encoding:NSUTF16LittleEndianStringEncoding];
+}
+
 @end
--- a/PTYTab.m
+++ b/PTYTab.m
@@ -2337,7 +2337,7 @@ static NSString* FormatRect(NSRect r) {
  
 - (NSString *)tmuxWindowName
 {
-    return tmuxWindowName_ ? tmuxWindowName_ : @"tmux window";
+    return tmuxWindowName_ ? tmuxWindowName_ : @"tmux";
 }
  
 - (void)setTmuxWindowName:(NSString *)tmuxWindowName

--- a/ScreenChar.h
+++ b/ScreenChar.h
@@ -30,6 +30,7 @@
 */
  
 #import <Cocoa/Cocoa.h>
+#import "NSStringITerm.h"
  
 // This is used in the rightmost column when a double-width character would
 // have been split in half and was wrapped to the next line. It is nonprintable
@@ -62,10 +63,7 @@
 #define ITERM2_PRIVATE_BEGIN 0xf000
 #define ITERM2_PRIVATE_END 0xf003
  
-// This is the standard unicode replacement character for when input couldn't
-// be parsed properly but we need to render something there.
-#define UNICODE_REPLACEMENT_CHAR 0xfffd
-#define ONECHAR_UNKNOWN ('?')   // Used for encodings other than utf-8.
+#define ONECHAR_UNKNOWN ('?')   // Replacement character for encodings other than utf-8.
  
 // Alternate semantics definitions
 // Default background color

--- a/ScreenChar.m
+++ b/ScreenChar.m
@@ -336,7 +336,7 @@ NSString* CharArrayToString(unichar* charHaystack, int o)
 }
  
 void DumpScreenCharArray(screen_char_t* screenChars, int lineLength) {
-    NSLog("%@", ScreenCharArrayToStringDebug(screenChars, lineLength));
+    NSLog(@"%@", ScreenCharArrayToStringDebug(screenChars, lineLength));
 }
  
 NSString* ScreenCharArrayToStringDebug(screen_char_t* screenChars,

--- a/TmuxGateway.m
+++ b/TmuxGateway.m
@@ -9,6 +9,7 @@
 #import "RegexKitLite.h"
 #import "TmuxController.h"
 #import "iTermApplicationDelegate.h"
+#import "NSStringITerm.h"
  
 NSString * const kTmuxGatewayErrorDomain = @"kTmuxGatewayErrorDomain";;
  
@@ -281,9 +282,9 @@ static NSString *kCommandIsLastInList = @"lastInList";
    NSString *command = [[[NSString alloc] initWithData:[stream_ subdataWithRange:commandRange]
                                               encoding:NSUTF8StringEncoding] autorelease];
    if (!command) {
-        NSLog(@"Non-UTF-8 command in stream %@", [stream_ subdataWithRange:commandRange]);
-        [self abortWithErrorMessage:@"Non-UTF-8 command in stream (please copy hex data from Console.app into a bug report)"];
-        return NO;
+        // The command was not UTF-8. Unfortunately, this can happen. If tmux has a non-UTF-8
+        // character in a pane, it will just output it.
+        command = [[[NSString alloc] initWithUTF8DataIgnoringErrors:[stream_ subdataWithRange:commandRange]] autorelease];
    }
    // At least on osx, the terminal driver adds \r at random places, sometimes adding two of them in a row!
    // We split on \n, which is safe, and just throw out any \r's that we see.

--- a/VT100Terminal.m
+++ b/VT100Terminal.m
@@ -158,7 +158,6 @@ static VT100TCC decode_xterm(unsigned char *, int, int *,NSStringEncoding);
 static VT100TCC decode_ansi(unsigned char *,int, int *,VT100Screen *);
 static VT100TCC decode_other(unsigned char *, int, int *, NSStringEncoding);
 static VT100TCC decode_control(unsigned char *, int, int *, NSStringEncoding, VT100Screen *, BOOL);
-static int decode_utf8_char(unsigned char *, int, int *);
 static VT100TCC decode_utf8(unsigned char *, int, int *);
 static VT100TCC decode_euccn(unsigned char *, int, int *);
 static VT100TCC decode_big5(unsigned char *,int, int *);
@@ -1872,74 +1871,6 @@ static VT100TCC decode_control(unsigned char *datap,
    return result;
 }
  
-// Examine the leading UTF-8 sequence in a char array and check that it
-// is properly encoded. Computes the number of bytes to use for the
-// first code point.
-//
-// Return value:
-// positive: This many bytes compose a legal Unicode character.
-// negative: abs(this many) bytes are illegal, should be replaced by one
-//   single replacement symbol.
-// zero: Unfinished sequence, input needs to grow.
-static int decode_utf8_char(unsigned char *datap,
-                            int datalen,
-                            int *result)
-{
-    unsigned int theChar;
-    int utf8Length;
-    unsigned char c;
-    // This maps a utf-8 sequence length to the smallest code point it should
-    // encode (e.g., using 5 bytes to encode an ascii character would be
-    // considered an error).
-    unsigned int smallest[7] = { 0, 0, 0x80UL, 0x800UL, 0x10000UL, 0x200000UL, 0x4000000UL };
-
-    if (datalen == 0) {
-        return 0;
-    }
-
-    c = *datap;
-    if ((c & 0x80) == 0x00) {
-        *result = c;
-        return 1;
-    } else if ((c & 0xE0) == 0xC0) {
-        theChar = c & 0x1F;
-        utf8Length = 2;
-    } else if ((c & 0xF0) == 0xE0) {
-        theChar = c & 0x0F;
-        utf8Length = 3;
-    } else if ((c & 0xF8) == 0xF0) {
-        theChar = c & 0x07;
-        utf8Length = 4;
-    } else if ((c & 0xFC) == 0xF8) {
-        theChar = c & 0x03;
-        utf8Length = 5;
-    } else if ((c & 0xFE) == 0xFC) {
-        theChar = c & 0x01;
-        utf8Length = 6;
-    } else {
-        return -1;
-    }
-    for (int i = 1; i < utf8Length; i++) {
-        if (datalen <= i) {
-            return 0;
-        }
-        c = datap[i];
-        if ((c & 0xc0) != 0x80) {
-            // Expected a continuation character but did not get one.
-            return -i;
-        }
-        theChar = (theChar << 6) | (c & 0x3F);
-    }
-
-    if (theChar < smallest[utf8Length]) {
-        // Reject overlong sequences.
-        return -utf8Length;
-    }
-
-    *result = (int)theChar;
-    return utf8Length;
-}
-
 static VT100TCC decode_utf8(unsigned char *datap,
                            int datalen,
                            int *rmlen)

--- a/tests/surrogatepair.txt
+++ b/tests/surrogatepair.txt
 This is an ancient greek numeral that looks like a christmas tree:
-ð…
+ð… bad char: £