Commit 496d1882 authored by Olly Betts
Browse files

Handle 4-byte UTF-8 sequences

Previously 'hop' and 'next' handled sequences of any length, but
commands which look at the character value only handled sequences up to
length 3.

Fixes #89.
parent f5d9bcf7
......@@ -64,31 +64,49 @@ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
/* Code for character groupings: utf8 cases */
/*
 * Decode one UTF-8 sequence reading forwards from p[c].
 *
 *   p    - buffer of code units (assumed to be an unsigned byte type, since
 *          values are compared against 0xC0/0xE0/0xF0 - confirm against the
 *          declaration of `symbol`)
 *   c    - index of the first byte of the sequence
 *   l    - limit: bytes at index >= l are never read
 *   slot - receives the decoded character value
 *
 * Returns the number of bytes consumed (1 to 4), or 0 if c >= l, in which
 * case *slot is left untouched.
 *
 * The sequence length is taken from the lead byte.  If the limit is reached
 * before the sequence is complete, the bytes read so far are combined as if
 * they formed a shorter sequence, so the function never reads past l.  No
 * validation of continuation bytes is performed.
 */
static int get_utf8(const symbol * p, int c, int l, int * slot) {
    int b0, b1, b2;
    if (c >= l) return 0;
    b0 = p[c++];
    if (b0 < 0xC0 || c == l) {   /* 1100 0000 */
        *slot = b0;
        return 1;
    }
    b1 = p[c++] & 0x3F;
    if (b0 < 0xE0 || c == l) {   /* 1110 0000 */
        *slot = (b0 & 0x1F) << 6 | b1;
        return 2;
    }
    b2 = p[c++] & 0x3F;
    if (b0 < 0xF0 || c == l) {   /* 1111 0000 */
        *slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
        return 3;
    }
    /* A 4-byte lead byte is 11110xxx: its payload is the low three bits,
     * so the mask is 0x7 (0xE would drop bit 0 and keep the always-zero
     * bit 3). */
    *slot = (b0 & 0x7) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F);
    return 4;
}
/*
 * Decode one UTF-8 sequence reading backwards, ending just before p[c].
 *
 *   p    - buffer of code units (assumed to be an unsigned byte type, since
 *          values are compared against 0x80/0xC0/0xE0 - confirm against the
 *          declaration of `symbol`)
 *   c    - index one past the last byte of the sequence
 *   lb   - backward limit: bytes at index < lb are never read
 *   slot - receives the decoded character value
 *
 * Returns the number of bytes consumed (1 to 4), or 0 if c <= lb, in which
 * case *slot is left untouched.
 *
 * Continuation bytes are accumulated in `a` from least significant upwards
 * until a lead byte is found.  If the backward limit is reached first, the
 * byte at the limit is treated as the lead byte, so the function never reads
 * before lb.  No validation of the sequence is performed.
 */
static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
    int a, b;
    if (c <= lb) return 0;
    b = p[--c];
    if (b < 0x80 || c == lb) {   /* 1000 0000 */
        *slot = b;
        return 1;
    }
    a = b & 0x3F;
    b = p[--c];
    if (b >= 0xC0 || c == lb) {   /* 1100 0000 */
        *slot = (b & 0x1F) << 6 | a;
        return 2;
    }
    a |= (b & 0x3F) << 6;
    b = p[--c];
    if (b >= 0xE0 || c == lb) {   /* 1110 0000 */
        *slot = (b & 0xF) << 12 | a;
        return 3;
    }
    /* A 4-byte lead byte is 11110xxx: its payload is the low three bits,
     * so the mask is 0x7 (0xE would drop bit 0 and keep the always-zero
     * bit 3). */
    *slot = (p[--c] & 0x7) << 18 | (b & 0x3F) << 12 | a;
    return 4;
}
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment