EN
JavaScript - convert string to bytes array (UTF-8)
6 points
In this short article, we would like to show how to convert a string to a UTF-8 byte array using JavaScript.
This solution works in older web browsers and Node.js.
xxxxxxxxxx
1
/**
 * Converts a string to an array of UTF-8 byte values by percent-encoding it
 * with encodeURIComponent and decoding the escapes back into numbers.
 *
 * encodeURIComponent throws a URIError when the string contains an unpaired
 * surrogate; that error propagates to the caller.
 *
 * @param {string} text - String to convert.
 * @returns {number[]} UTF-8 bytes of the string, each in the range 0-255.
 */
const toBytes = (text) => {
    const encoded = encodeURIComponent(text);
    const bytes = [];
    let index = 0;
    while (index < encoded.length) {
        const symbol = encoded.charAt(index);
        if (symbol !== '%') {
            // Characters left unescaped are ASCII, so the char code is the byte.
            bytes.push(symbol.charCodeAt(0));
            index += 1;
            continue;
        }
        // "%XX" escape: the two hex digits after '%' hold one UTF-8 byte.
        const digits = encoded.substring(index + 1, index + 3);
        index += 3;
        if (digits) {
            bytes.push(parseInt(digits, 16));
        }
    }
    return bytes;
};


// Usage example:

const bytes = toBytes('Some text here...'); // converts string to UTF-8 bytes

console.log(bytes); // [83, 111, 109, 101, 32, 116, 101, 120, 116, 32, 104, 101, 114, 101, 46, 46, 46]
This solution appeared in the major web browsers around 2014-2020 and in Node.js v11.
xxxxxxxxxx
1
// TextEncoder always produces UTF-8 and its constructor takes no arguments,
// so no encoding label is passed. One shared instance is reused for all calls.
const encoder = new TextEncoder();

/**
 * Converts a string to its UTF-8 bytes using the built-in TextEncoder.
 *
 * @param {string} text - String to convert.
 * @returns {Uint8Array} UTF-8 bytes of the string.
 */
const toBytes = (text) => {
    return encoder.encode(text);
};


// Usage example:

const bytes = toBytes('Some text here...'); // converts string to UTF-8 bytes

console.log(bytes); // Uint8Array [83, 111, 109, 101, 32, 116, 101, 120, 116, 32, 104, 101, 114, 101, 46, 46, 46]
This solution has quite good performance and works in older web browsers and Node.js.
xxxxxxxxxx
1
/**
 * Converts a string to an array of UTF-8 byte values by encoding each code
 * point manually, without TextEncoder or encodeURIComponent.
 *
 * Valid surrogate pairs are combined into a single code point and encoded as
 * four bytes. An unpaired surrogate (a high surrogate not followed by a low
 * one, or a low surrogate on its own) is encoded as U+FFFD (EF BF BD), the
 * same replacement TextEncoder applies.
 *
 * @param {string} text - String to convert.
 * @returns {number[]} UTF-8 bytes of the string, each in the range 0-255.
 */
const toBytes = (text) => {
    const result = [];
    for (let i = 0; i < text.length; i += 1) {
        let code = text.charCodeAt(i);
        if (code >= 0xD800 && code <= 0xDFFF) {
            // Surrogate code unit: only a high surrogate immediately followed
            // by a low surrogate forms a valid pair.
            const lo = i + 1 < text.length ? text.charCodeAt(i + 1) : 0;
            if (code <= 0xDBFF && lo >= 0xDC00 && lo <= 0xDFFF) {
                // Parentheses matter: '+' binds tighter than '<<' and '|'.
                code = 0x10000 + (((code & 0x03FF) << 10) | (lo & 0x03FF));
                i += 1; // the low surrogate has been consumed
            } else {
                code = 0xFFFD; // unpaired surrogate -> replacement character
            }
        }
        if (code < 0x80) {
            // code point range: U+0000 - U+007F
            // bytes: 0xxxxxxx
            result.push(code);
        } else if (code < 0x800) {
            // code point range: U+0080 - U+07FF
            // bytes: 110xxxxx 10xxxxxx
            result.push(
                0xC0 | (code >> 6),
                0x80 | (code & 0x3F)
            );
        } else if (code < 0x10000) {
            // code point range: U+0800 - U+FFFF
            // bytes: 1110xxxx 10xxxxxx 10xxxxxx
            result.push(
                0xE0 | (code >> 12),
                0x80 | ((code >> 6) & 0x3F),
                0x80 | (code & 0x3F)
            );
        } else {
            // code point range: U+10000 - U+10FFFF
            // bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            result.push(
                0xF0 | (code >> 18),
                0x80 | ((code >> 12) & 0x3F),
                0x80 | ((code >> 6) & 0x3F),
                0x80 | (code & 0x3F)
            );
        }
    }
    return result;
};


// Usage example:

const bytes = toBytes('Some text here...'); // converts string to UTF-8 bytes

console.log(bytes); // [83, 111, 109, 101, 32, 116, 101, 120, 116, 32, 104, 101, 114, 101, 46, 46, 46]