I need to cut a string: basically, if the string is longer than 1 MB, I should cut it down to that size.
I am using these functions to check the string size:
/**
 * Converts a byte count into megabytes, where 1 MB = 1024 * 1024 bytes.
 *
 * @param {number} bytes - Size in bytes.
 * @returns {number} The same size expressed in megabytes (may be fractional).
 */
function __to_mb(bytes) {
  const BYTES_PER_MB = 1024 * 1024;
  return bytes / BYTES_PER_MB;
}
/**
 * Measures how many megabytes `str` occupies once encoded as UTF-8.
 *
 * @param {string} str - The string to measure.
 * @returns {number} The UTF-8 encoded size in megabytes, as computed by `__to_mb`.
 */
function __size_mb(str) {
  const byteCount = Buffer.byteLength(str, 'utf8');
  return __to_mb(byteCount);
}
Then I check the size of the string like this:
// Act only when the UTF-8 encoded size of `str`, as reported by __size_mb, exceeds 1 MB.
if (__size_mb(str) > 1) { /* do something */ }
But how do I cut it?
A JavaScript string consists of 16-bit code units, with some characters using one 16-bit unit and others needing two 16-bit units (a surrogate pair).
There is no easy way to just take a number of bytes and consider it done: a character made of two 16-bit units might straddle the cut-off location, in which case it would be cut in half.
To make a safe cut, we can use str.codePointAt(index), which was introduced in ES2015. It knows which characters are 16-bit and which are 2x 16-bit, and it combines either 1 or 2 of these 16-bit values into a single integer result.
- If codePointAt() returns a value <= 2^16 - 1, then we have a 16-bit character at offset index.
- If codePointAt() returns a value >= 2^16, then we have a 2x 16-bit character at offsets index and index + 1.
Unfortunately, this means going through the entire string to assess each index. This may seem awkward, and it may even be slow, but I am not aware of a faster or smarter way of doing this.
Demo:
/**
 * Truncates `str` so that it fits within `maxBytes`, never splitting a
 * surrogate pair (a character stored as two 16-bit code units).
 *
 * Sizes are counted as UTF-16: every 16-bit code unit costs 2 bytes. This is
 * NOT the UTF-8 size that `Buffer.byteLength(str, 'utf8')` reports.
 *
 * @param {string} str - The string to truncate.
 * @param {number} maxBytes - Maximum allowed size, in UTF-16 bytes.
 * @returns {{text: string, bytes: number}} The longest prefix of `str` that
 *   fits the budget, and the number of UTF-16 bytes that prefix occupies.
 */
function safeCutOff(str, maxBytes) {
  let widthInBytes = 0;
  let index = 0;
  while (index < str.length) {
    // codePointAt() yields a value above 0xFFFF only when `index` is the start
    // of a surrogate pair, i.e. the character spans two code units.
    const positionsUsed = str.codePointAt(index) <= 0xffff ? 1 : 2;
    const newWidthInBytes = widthInBytes + 2 * positionsUsed;
    // Stop before the character that would exceed the budget, so it is
    // dropped whole rather than cut in half.
    if (newWidthInBytes > maxBytes) {
      break;
    }
    index += positionsUsed;
    widthInBytes = newWidthInBytes;
  }
  return { text: str.substring(0, index), bytes: widthInBytes };
}

// Demo
const str = "abç🔥😂déΩf👍g😏h"; // string of 13 characters
console.log(`str.length = ${str.length}`); // shows 17 because of double-width chars
console.log(`size in bytes = ${str.length * 2}`); // length * 2 gives the size in bytes (UTF-16)

const maxByteLengths = [8, 16, 24, 32, 40];
for (const maxBytes of maxByteLengths) {
  const data = safeCutOff(str, maxBytes);
  console.log(`${maxBytes} bytes -> ${data.text} (${data.bytes} bytes)`);
}
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.