Skip to content
Snippets Groups Projects
Commit b3d4acf9 authored by George Nachman's avatar George Nachman
Browse files

Fix bug where broken UTF-8 codes in tmux history threw an error. Now we just...

Fix bug where broken UTF-8 codes in tmux history threw an error. Now we just replace them with combining marks
parent 8817f3fa
No related branches found
No related tags found
No related merge requests found
Loading
Loading
@@ -34,6 +34,22 @@
 
#import <Foundation/Foundation.h>
 
// This is the standard Unicode replacement character (U+FFFD), used when
// input couldn't be parsed properly but we need to render something there.
#define UNICODE_REPLACEMENT_CHAR 0xfffd
// Examine the leading UTF-8 sequence in a char array and check that it
// is properly encoded. Computes the number of bytes to use for the
// first code point. On a positive return value the decoded code point is
// stored in *result; for zero or negative return values *result is not
// written.
//
// Parameters:
// datap: the bytes to examine.
// datalen: the number of bytes available at datap.
// result: receives the decoded code point on success.
//
// Return value:
// positive: This many bytes compose a legal Unicode character.
// negative: abs(this many) bytes are illegal (bad lead byte, missing
// continuation byte, or overlong encoding) and should be replaced by
// one single replacement symbol.
// zero: Unfinished sequence, input needs to grow.
int decode_utf8_char(const unsigned char * restrict datap,
int datalen,
int * restrict result);
 
@interface NSString (iTerm)
 
Loading
Loading
@@ -59,4 +75,8 @@
// Convert a string of hex values (an even number of [0-9A-Fa-f]) into data.
- (NSData *)dataFromHexValues;
 
// Always returns a non-null value, but it may contain replacement chars for
// malformed UTF-8 sequences.
- (NSString *)initWithUTF8DataIgnoringErrors:(NSData *)data;
@end
Loading
Loading
@@ -457,4 +457,98 @@ static int fromhex(unichar c) {
stringByReplacingOccurrencesOfString:@"\"" withString:@"\\\""];
}
 
// Decodes the single UTF-8 sequence that starts at datap[0].
//
// Returns:
//   > 0  the sequence is well formed; the value is its length in bytes and
//        the code point has been stored in *result.
//   < 0  abs(value) leading bytes are invalid and should be shown as one
//        replacement symbol; *result is not written.
//   0    the input ends before the sequence is complete (or is empty);
//        *result is not written.
int decode_utf8_char(const unsigned char *datap,
int datalen,
int * restrict result)
{
    // Smallest code point that may legitimately be encoded with a sequence
    // of the given length (index == length). Anything lower is an overlong
    // encoding, e.g. an ASCII character spread over several bytes.
    static const unsigned int minimumForLength[7] = {
        0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };

    if (datalen == 0) {
        return 0;
    }

    const unsigned char lead = datap[0];
    if (lead < 0x80) {
        // Plain ASCII: one byte, value is the byte itself.
        *result = lead;
        return 1;
    }

    int sequenceLength;
    unsigned int codePoint;
    if (lead < 0xC0) {
        // 10xxxxxx: a continuation byte cannot start a sequence.
        return -1;
    } else if (lead < 0xE0) {
        // 110xxxxx
        sequenceLength = 2;
        codePoint = lead & 0x1F;
    } else if (lead < 0xF0) {
        // 1110xxxx
        sequenceLength = 3;
        codePoint = lead & 0x0F;
    } else if (lead < 0xF8) {
        // 11110xxx
        sequenceLength = 4;
        codePoint = lead & 0x07;
    } else if (lead < 0xFC) {
        // 111110xx
        sequenceLength = 5;
        codePoint = lead & 0x03;
    } else if (lead < 0xFE) {
        // 1111110x
        sequenceLength = 6;
        codePoint = lead & 0x01;
    } else {
        // 0xFE and 0xFF never start a sequence.
        return -1;
    }

    for (int index = 1; index < sequenceLength; ++index) {
        if (index >= datalen) {
            // Ran out of input in the middle of the sequence.
            return 0;
        }
        const unsigned char trail = datap[index];
        if ((trail & 0xC0) != 0x80) {
            // Not a continuation byte: the bytes before it are garbage.
            return -index;
        }
        codePoint = (codePoint << 6) | (trail & 0x3F);
    }

    if (codePoint < minimumForLength[sequenceLength]) {
        // Overlong encoding; reject the whole sequence.
        return -sequenceLength;
    }

    *result = (int)codePoint;
    return sequenceLength;
}
// Initializes the receiver from |data|, decoding it leniently as UTF-8.
//
// Every byte run that cannot be turned into a Unicode scalar value is
// replaced by a single UNICODE_REPLACEMENT_CHAR instead of failing:
//   - invalid lead bytes, missing continuation bytes and overlong encodings
//     (as reported by decode_utf8_char),
//   - sequences that decode to a surrogate (U+D800..U+DFFF) or to a value
//     above U+10FFFF, neither of which can be represented in UTF-16,
//   - a sequence that is cut off by the end of |data|.
//
// data: the bytes to decode. nil or empty data yields an empty string.
// Returns the initialized string.
- (NSString *)initWithUTF8DataIgnoringErrors:(NSData *)data {
    // decode_utf8_char never looks at more than 6 bytes, so that is all we
    // need to offer it. This also keeps the NSUInteger length from being
    // narrowed to int for very large inputs.
    static const NSUInteger kMaxUTF8SequenceLength = 6;

    const unsigned char *cursor = data.bytes;
    NSUInteger remaining = data.length;
    NSMutableData *utf16Data = [NSMutableData data];

    while (remaining > 0) {
        const int available = (remaining > kMaxUTF8SequenceLength)
            ? (int)kMaxUTF8SequenceLength
            : (int)remaining;
        int codePoint = 0;
        int consumed = decode_utf8_char(cursor, available, &codePoint);

        if (consumed == 0) {
            // The final sequence is truncated. A zero result means every
            // remaining byte belonged to it, so swallow them all and emit
            // one replacement character rather than dropping them silently.
            codePoint = UNICODE_REPLACEMENT_CHAR;
            consumed = available;
        } else if (consumed < 0) {
            // Malformed bytes: replace the whole run with one symbol.
            codePoint = UNICODE_REPLACEMENT_CHAR;
            consumed = -consumed;
        } else if (codePoint > 0x10FFFF ||
                   (codePoint >= 0xD800 && codePoint <= 0xDFFF)) {
            // Structurally valid UTF-8, but not a Unicode scalar value.
            // Encoding it would produce ill-formed UTF-16.
            codePoint = UNICODE_REPLACEMENT_CHAR;
        }

        UniChar units[2];
        NSUInteger unitCount;
        if (codePoint > 0xFFFF) {
            // Supplementary plane: split into a surrogate pair.
            const unsigned int offset = (unsigned int)codePoint - 0x10000;
            units[0] = (UniChar)(0xD800 + (offset >> 10));
            units[1] = (UniChar)(0xDC00 + (offset & 0x3FF));
            unitCount = 2;
        } else {
            units[0] = (UniChar)codePoint;
            unitCount = 1;
        }
        // The buffer is decoded below as little-endian UTF-16, so store the
        // code units in that byte order regardless of the host's.
        for (NSUInteger i = 0; i < unitCount; i++) {
            units[i] = NSSwapHostShortToLittle(units[i]);
        }
        [utf16Data appendBytes:units length:unitCount * sizeof(UniChar)];

        cursor += consumed;
        remaining -= (NSUInteger)consumed;
    }

    return [self initWithData:utf16Data encoding:NSUTF16LittleEndianStringEncoding];
}
@end
Loading
Loading
@@ -2337,7 +2337,7 @@ static NSString* FormatRect(NSRect r) {
 
// Returns tmuxWindowName_ when it is set, otherwise a default label.
// NOTE(review): two return statements appear back to back here, so only the
// first one can execute. This looks like the before/after lines of a diff;
// confirm which default string is intended and drop the other line.
- (NSString *)tmuxWindowName
{
return tmuxWindowName_ ? tmuxWindowName_ : @"tmux window";
return tmuxWindowName_ ? tmuxWindowName_ : @"tmux";
}
 
- (void)setTmuxWindowName:(NSString *)tmuxWindowName
Loading
Loading
Loading
Loading
@@ -30,6 +30,7 @@
*/
 
#import <Cocoa/Cocoa.h>
#import "NSStringITerm.h"
 
// This is used in the rightmost column when a double-width character would
// have been split in half and was wrapped to the next line. It is nonprintable
Loading
Loading
@@ -62,10 +63,7 @@
#define ITERM2_PRIVATE_BEGIN 0xf000
#define ITERM2_PRIVATE_END 0xf003
 
// This is the standard unicode replacement character for when input couldn't
// be parsed properly but we need to render something there.
#define UNICODE_REPLACEMENT_CHAR 0xfffd
#define ONECHAR_UNKNOWN ('?') // Used for encodings other than utf-8.
#define ONECHAR_UNKNOWN ('?') // Replacement character for encodings other than UTF-8.
 
// Alternate semantics definitions
// Default background color
Loading
Loading
Loading
Loading
@@ -336,7 +336,7 @@ NSString* CharArrayToString(unichar* charHaystack, int o)
}
 
// Debugging aid: writes the string produced by ScreenCharArrayToStringDebug
// for screenChars and lineLength to the log via NSLog.
// NOTE(review): the first NSLog call passes a C string literal ("%@") where
// NSLog expects an NSString format; the second call uses the @"%@" form.
// This looks like the before/after lines of a diff; confirm that only the
// second call is meant to remain.
void DumpScreenCharArray(screen_char_t* screenChars, int lineLength) {
NSLog("%@", ScreenCharArrayToStringDebug(screenChars, lineLength));
NSLog(@"%@", ScreenCharArrayToStringDebug(screenChars, lineLength));
}
 
NSString* ScreenCharArrayToStringDebug(screen_char_t* screenChars,
Loading
Loading
Loading
Loading
@@ -9,6 +9,7 @@
#import "RegexKitLite.h"
#import "TmuxController.h"
#import "iTermApplicationDelegate.h"
#import "NSStringITerm.h"
 
NSString * const kTmuxGatewayErrorDomain = @"kTmuxGatewayErrorDomain";;
 
Loading
Loading
@@ -281,9 +282,9 @@ static NSString *kCommandIsLastInList = @"lastInList";
NSString *command = [[[NSString alloc] initWithData:[stream_ subdataWithRange:commandRange]
encoding:NSUTF8StringEncoding] autorelease];
if (!command) {
NSLog(@"Non-UTF-8 command in stream %@", [stream_ subdataWithRange:commandRange]);
[self abortWithErrorMessage:@"Non-UTF-8 command in stream (please copy hex data from Console.app into a bug report)"];
return NO;
// The command was not UTF-8. Unfortunately, this can happen. If tmux has a non-UTF-8
// character in a pane, it will just output it.
command = [[[NSString alloc] initWithUTF8DataIgnoringErrors:[stream_ subdataWithRange:commandRange]] autorelease];
}
// At least on osx, the terminal driver adds \r at random places, sometimes adding two of them in a row!
// We split on \n, which is safe, and just throw out any \r's that we see.
Loading
Loading
Loading
Loading
@@ -158,7 +158,6 @@ static VT100TCC decode_xterm(unsigned char *, int, int *,NSStringEncoding);
static VT100TCC decode_ansi(unsigned char *,int, int *,VT100Screen *);
static VT100TCC decode_other(unsigned char *, int, int *, NSStringEncoding);
static VT100TCC decode_control(unsigned char *, int, int *, NSStringEncoding, VT100Screen *, BOOL);
static int decode_utf8_char(unsigned char *, int, int *);
static VT100TCC decode_utf8(unsigned char *, int, int *);
static VT100TCC decode_euccn(unsigned char *, int, int *);
static VT100TCC decode_big5(unsigned char *,int, int *);
Loading
Loading
@@ -1872,74 +1871,6 @@ static VT100TCC decode_control(unsigned char *datap,
return result;
}
 
// Decodes the single UTF-8 sequence that starts at datap[0].
//
// Return value:
// positive: The sequence is well formed; this many bytes compose one code
// point, which has been stored in *result.
// negative: abs(this many) leading bytes are invalid and should be shown
// as one single replacement symbol. *result is not written.
// zero: The input ends before the sequence is complete (or is empty).
// *result is not written.
static int decode_utf8_char(unsigned char *datap,
int datalen,
int *result)
{
    // Smallest code point that may legitimately be encoded with a sequence
    // of the given length (index == length). Anything lower is an overlong
    // encoding, e.g. an ASCII character spread over several bytes.
    static const unsigned int minimumForLength[7] = {
        0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };

    if (datalen == 0) {
        return 0;
    }

    const unsigned char lead = datap[0];
    if (lead < 0x80) {
        // Plain ASCII: one byte, value is the byte itself.
        *result = lead;
        return 1;
    }

    int sequenceLength;
    unsigned int codePoint;
    if (lead < 0xC0) {
        // 10xxxxxx: a continuation byte cannot start a sequence.
        return -1;
    } else if (lead < 0xE0) {
        // 110xxxxx
        sequenceLength = 2;
        codePoint = lead & 0x1F;
    } else if (lead < 0xF0) {
        // 1110xxxx
        sequenceLength = 3;
        codePoint = lead & 0x0F;
    } else if (lead < 0xF8) {
        // 11110xxx
        sequenceLength = 4;
        codePoint = lead & 0x07;
    } else if (lead < 0xFC) {
        // 111110xx
        sequenceLength = 5;
        codePoint = lead & 0x03;
    } else if (lead < 0xFE) {
        // 1111110x
        sequenceLength = 6;
        codePoint = lead & 0x01;
    } else {
        // 0xFE and 0xFF never start a sequence.
        return -1;
    }

    for (int index = 1; index < sequenceLength; ++index) {
        if (index >= datalen) {
            // Ran out of input in the middle of the sequence.
            return 0;
        }
        const unsigned char trail = datap[index];
        if ((trail & 0xC0) != 0x80) {
            // Not a continuation byte: the bytes before it are garbage.
            return -index;
        }
        codePoint = (codePoint << 6) | (trail & 0x3F);
    }

    if (codePoint < minimumForLength[sequenceLength]) {
        // Overlong encoding; reject the whole sequence.
        return -sequenceLength;
    }

    *result = (int)codePoint;
    return sequenceLength;
}
static VT100TCC decode_utf8(unsigned char *datap,
int datalen,
int *rmlen)
Loading
Loading
This is an ancient greek numeral that looks like a christmas tree:
ð…
ð… bad char: £
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment