[Edit]

JavaScript - convert string to bytes array (UTF-8)

1 contributors

2 contributions

0 discussions

6 points

Created by:

Eshaal-Wilkinson

774

In this short article, we would like to show how to convert a string to a UTF-8 byte array using JavaScript.

Practical examples

Edit

1. Custom solution

Edit

This solution works in older web browsers and Node.js.

xxxxxxxxxx
 
/**
 * Converts a string to an array of UTF-8 byte values.
 *
 * The text is first percent-encoded with encodeURIComponent, which emits
 * every non-unreserved character as %XX escapes of its UTF-8 bytes; the
 * escapes are then decoded back into numbers.
 *
 * @param {string} text - Text to convert.
 * @returns {number[]} UTF-8 bytes of the text (each 0-255).
 * @throws {URIError} If the text contains a lone surrogate
 *     (raised by encodeURIComponent).
 */
const toBytes = (text) => {
    const encoded = encodeURIComponent(text);
    const bytes = [];
    let pos = 0;
    while (pos < encoded.length) {
        if (encoded[pos] !== '%') {
            // Unescaped characters are plain ASCII: the code unit is the byte.
            bytes.push(encoded.charCodeAt(pos));
            pos += 1;
            continue;
        }
        // Escape sequence: the two characters after '%' are the hex byte value.
        const digits = encoded.substring(pos + 1, pos + 3);
        if (digits) {
            bytes.push(parseInt(digits, 16));
        }
        pos += 3;
    }
    return bytes;
};
​
​
// Usage example:
​
const bytes = toBytes('Some text here...'); // converts string to UTF-8 bytes
​
console.log(bytes);  // [83, 111, 109, 101, 32, 116, 101, 120, 116, 32, 104, 101, 114, 101, 46, 46, 46]

Auto running

2. Embedded solution

Edit

This solution appeared in the major web browsers around 2014-2020 and in Node.js v11.

xxxxxxxxxx
 
// TextEncoder always encodes to UTF-8; its constructor takes no arguments.
const encoder = new TextEncoder();

/**
 * Converts a string to its UTF-8 bytes using the built-in TextEncoder.
 *
 * @param {string} text - Text to convert.
 * @returns {Uint8Array} UTF-8 bytes of the text (a typed array, not a plain Array).
 */
const toBytes = (text) => {
    return encoder.encode(text);
};
​
​
// Usage example:
​
const bytes = toBytes('Some text here...'); // converts string to UTF-8 bytes
​
console.log(bytes);  // Uint8Array(17) [83, 111, 109, 101, 32, 116, 101, 120, 116, 32, 104, 101, 114, 101, 46, 46, 46]

Auto running

3. Optimal solution

Edit

This solution has quite good performance, it works in older web browsers and Node.js.

xxxxxxxxxx
 
/**
 * Converts a string to an array of UTF-8 byte values without relying on
 * TextEncoder or encodeURIComponent.
 *
 * The string is walked one UTF-16 code unit at a time. A valid surrogate
 * pair is combined into a single code point and encoded as four bytes.
 * A lone (unpaired) surrogate is encoded as U+FFFD REPLACEMENT CHARACTER
 * (EF BF BD), which is the same result TextEncoder produces.
 *
 * @param {string} text - Text to convert.
 * @returns {number[]} UTF-8 bytes of the text (each 0-255).
 */
const toBytes = (text) => {
    const result = [];
    for (let i = 0; i < text.length; i += 1) {
        const hi = text.charCodeAt(i);
        if (hi < 0x0080) {
            // code point range: U+0000 - U+007F
            // bytes: 0xxxxxxx
            result.push(hi);
            continue;
        }
        if (hi < 0x0800) {
            // code point range: U+0080 - U+07FF
            // bytes: 110xxxxx 10xxxxxx
            result.push(0xC0 | (hi >> 6),
                        0x80 | (hi & 0x3F));
            continue;
        }
        if (hi < 0xD800 || hi >= 0xE000) {
            // code point range: U+0800 - U+FFFF (excluding surrogates)
            // bytes: 1110xxxx 10xxxxxx 10xxxxxx
            result.push(0xE0 | (hi >> 12),
                        0x80 | ((hi >> 6) & 0x3F),
                        0x80 | (hi & 0x3F));
            continue;
        }
        // hi is a surrogate code unit (0xD800 - 0xDFFF). It only forms a
        // code point when it is a high surrogate directly followed by a
        // low surrogate.
        if (hi < 0xDC00 && i + 1 < text.length) {
            const lo = text.charCodeAt(i + 1);
            if (lo >= 0xDC00 && lo < 0xE000) {
                // Explicit parentheses matter: `+` binds tighter than `<<`,
                // so the offset must be added after the 20 payload bits
                // have been assembled.
                const code = 0x10000 + (((hi & 0x03FF) << 10) | (lo & 0x03FF));
                // code point range: U+10000 - U+10FFFF
                // bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                result.push(0xF0 | (code >> 18),
                            0x80 | ((code >> 12) & 0x3F),
                            0x80 | ((code >> 6) & 0x3F),
                            0x80 | (code & 0x3F));
                i += 1; // the low surrogate has been consumed
                continue;
            }
        }
        // Lone surrogate: emit U+FFFD instead of dropping it or corrupting
        // the following character.
        result.push(0xEF, 0xBF, 0xBD);
    }
    return result;
};
​
​
// Usage example:
​
const bytes = toBytes('Some text here...'); // converts string to UTF-8 bytes
​
console.log(bytes);  // [83, 111, 109, 101, 32, 116, 101, 120, 116, 32, 104, 101, 114, 101, 46, 46, 46]