diff --git a/lib/unicode.js b/lib/unicode.js index 21bae12..f98d89a 100644 --- a/lib/unicode.js +++ b/lib/unicode.js @@ -532,12 +532,18 @@ exports.fromCodePoint = function() { return result; }; +/** + * Regexes for matching wide, surrogate-pair, and combining characters, grouped under `exports.chars`. + */ + +exports.chars = {}; + // Double width characters that are _not_ surrogate pairs. // NOTE: 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are not necessary for this // regex anyway. This regex is used to put a blank char after wide chars to // be eaten, however, if this is a surrogate pair, parseContent already adds // the extra one char because its length equals 2 instead of 1. -exports.wideChars = new RegExp('([' +exports.chars.wide = new RegExp('([' + '\\u1100-\\u115f' // Hangul Jamo init. consonants + '\\u2329\\u232a' + '\\u2e80-\\u303e\\u3040-\\ua4cf' // CJK ... Yi @@ -550,21 +556,21 @@ exports.wideChars = new RegExp('([' + '])', 'g'); // All wide chars including surrogate pairs. -exports.allWideChars = new RegExp('(' +exports.chars.all = new RegExp('(' // 0x20000 - 0x2fffd: + '[\\ud840-\\ud87f][\\udc00-\\udffd]' + '|' // 0x30000 - 0x3fffd: + '[\\ud880-\\ud8bf][\\udc00-\\udffd]' + '|' - + exports.wideChars.source.slice(1, -1) + + exports.chars.wide.source.slice(1, -1) + ')', 'g'); // Regex to detect a surrogate pair. -exports.surrogateChars = /[\ud800-\udbff][\udc00-\udfff]/g; +exports.chars.surrogate = /[\ud800-\udbff][\udc00-\udfff]/g; // Regex to find combining characters. 
-exports.combiningChars = exports.combiningTable.reduce(function(out, row) { +exports.chars.combining = exports.combiningTable.reduce(function(out, row) { var low, high, range; if (row[0] > 0x00ffff) { low = exports.fromCodePoint(row[0]); @@ -591,7 +597,7 @@ exports.combiningChars = exports.combiningTable.reduce(function(out, row) { return out; }, '['); -exports.combiningChars = new RegExp(exports.combiningChars, 'g'); +exports.chars.combining = new RegExp(exports.chars.combining, 'g'); function hexify(n) { n = n.toString(16); @@ -600,7 +606,7 @@ function hexify(n) { } /* -exports.combiningChars = new RegExp( +exports.chars.combining = new RegExp( '[' + '\\u0300-\\u036f' + '\\u0483-\\u0486' diff --git a/lib/widget.js b/lib/widget.js index df097b0..1261e86 100644 --- a/lib/widget.js +++ b/lib/widget.js @@ -2424,17 +2424,17 @@ Element.prototype.parseContent = function(noTags) { if (this.screen.fullUnicode) { // double-width chars will eat the next char after render. create a // blank character after it so it doesn't eat the real next char. - content = content.replace(unicode.wideChars, '$1_'); + content = content.replace(unicode.chars.wide, '$1_'); } else { // no double-width: replace them with question-marks. - content = content.replace(unicode.allWideChars, '??'); + content = content.replace(unicode.chars.all, '??'); // delete combining characters since they're 0-width anyway. // NOTE: We could drop this, the non-surrogates would get changed to ? by // the unicode filter, and surrogates changed to ? by the surrogate // regex. however, the user might expect them to be 0-width. - content = content.replace(unicode.combiningChars, ''); + content = content.replace(unicode.chars.combining, ''); // no surrogate pairs: replace them with question-marks. 
- content = content.replace(unicode.surrogateChars, '?'); + content = content.replace(unicode.chars.surrogate, '?'); } if (!noTags) { @@ -2769,7 +2769,7 @@ main: } // Pad the end of the lines if the surrogate is not a double-width char. // var surrogates = out[i].length - punycode.ucs2.decode(out[i]).length; - var surrogates = out[i].match(unicode.surrogateChars); + var surrogates = out[i].match(unicode.chars.surrogate); if (surrogates && surrogates.length) { for (var j = 0; j < surrogates.length; j++) { var cwid = unicode.charWidth(surrogates[j], 0);