EN
JavaScript - convert string to bytes array (UTF-8)
6
points
In this short article, we would like to show how to convert a string to a UTF-8 byte array using JavaScript.
Practical examples
1. Custom solution
This solution works in older web browsers and Node.js.
// ONLINE-RUNNER:browser;
/**
 * Converts a string to an array of its UTF-8 byte values.
 *
 * The text is first percent-encoded with encodeURIComponent: every "%XX"
 * escape in that output is one UTF-8 byte, and every character left
 * unescaped is ASCII, so its char code is the byte value.
 *
 * Note: encodeURIComponent throws a URIError for unpaired surrogates,
 * so this function does as well.
 *
 * @param {string} text - text to convert
 * @returns {number[]} UTF-8 byte values (0-255)
 */
const toBytes = (text) => {
    const encoded = encodeURIComponent(text);
    const result = [];
    let index = 0;
    while (index < encoded.length) {
        const symbol = encoded[index];
        index += 1;
        if (symbol !== '%') {
            // unescaped ASCII character: the char code is the byte
            result.push(symbol.charCodeAt(0));
            continue;
        }
        // "%XX" escape: the two hex digits after '%' are the byte value
        const hex = encoded.substring(index, index + 2);
        index += 2;
        if (hex) {
            result.push(parseInt(hex, 16));
        }
    }
    return result;
};

// Usage example:

const bytes = toBytes('Some text here...');  // UTF-8 bytes of the given string

console.log(bytes);  // [83, 111, 109, 101, 32, 116, 101, 120, 116, 32, 104, 101, 114, 101, 46, 46, 46]
2. Embedded solution
This solution appeared in the major web browsers around 2014-2020 and in Node.js v11.
// ONLINE-RUNNER:browser;
// TextEncoder always produces UTF-8 and its constructor takes no arguments,
// so a single shared instance is created once and reused for every call.
const encoder = new TextEncoder();

/**
 * Converts a string to its UTF-8 bytes using the built-in TextEncoder.
 *
 * @param {string} text - text to convert
 * @returns {Uint8Array} UTF-8 bytes of the text
 */
const toBytes = (text) => {
    return encoder.encode(text);
};

// Usage example:

const bytes = toBytes('Some text here...'); // converts string to UTF-8 bytes

console.log(bytes); // Uint8Array [83, 111, 109, 101, 32, 116, 101, 120, 116, 32, 104, 101, 114, 101, 46, 46, 46]
3. Optimal solution
This solution has quite good performance and works in older web browsers and Node.js.
// ONLINE-RUNNER:browser;
/**
 * Converts a string to an array of its UTF-8 byte values without relying on
 * TextEncoder or encodeURIComponent.
 *
 * The string is walked by UTF-16 code unit. Valid surrogate pairs are combined
 * into a single code point and encoded as 4 bytes. Unpaired surrogates are
 * encoded as U+FFFD (EF BF BD), the same result TextEncoder produces.
 *
 * @param {string} text - text to convert
 * @returns {number[]} UTF-8 byte values (0-255)
 */
const toBytes = (text) => {
    const result = [];
    for (let i = 0; i < text.length; i += 1) {
        const hi = text.charCodeAt(i);
        if (hi < 0x0080) {
            // code point range: U+0000 - U+007F
            // bytes: 0xxxxxxx
            result.push(hi);
            continue;
        }
        if (hi < 0x0800) {
            // code point range: U+0080 - U+07FF
            // bytes: 110xxxxx 10xxxxxx
            result.push(0xC0 | (hi >> 6),
                        0x80 | (hi & 0x3F));
            continue;
        }
        if (hi < 0xD800 || hi >= 0xE000) {
            // code point range: U+0800 - U+FFFF (surrogates excluded)
            // bytes: 1110xxxx 10xxxxxx 10xxxxxx
            result.push(0xE0 | (hi >> 12),
                        0x80 | ((hi >> 6) & 0x3F),
                        0x80 | (hi & 0x3F));
            continue;
        }
        // hi is a surrogate code unit; charCodeAt returns NaN past the end of
        // the string, which fails the range check below
        const lo = text.charCodeAt(i + 1);
        const isPair = hi < 0xDC00 && lo >= 0xDC00 && lo < 0xE000;
        if (!isPair) {
            // unpaired surrogate: emit U+FFFD and do not consume the next unit
            result.push(0xEF, 0xBF, 0xBD);
            continue;
        }
        i += 1; // the low surrogate has been consumed as well
        // the parentheses matter: `+` binds tighter than `<<`, and `<<`
        // binds tighter than `|`
        const code = 0x10000 + (((hi & 0x03FF) << 10) | (lo & 0x03FF));
        // code point range: U+10000 - U+10FFFF
        // bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        result.push(0xF0 | (code >> 18),
                    0x80 | ((code >> 12) & 0x3F),
                    0x80 | ((code >> 6) & 0x3F),
                    0x80 | (code & 0x3F));
    }
    return result;
};

// Usage example:

const bytes = toBytes('Some text here...'); // converts string to UTF-8 bytes

console.log(bytes); // [83, 111, 109, 101, 32, 116, 101, 120, 116, 32, 104, 101, 114, 101, 46, 46, 46]