// Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

part of "dart:convert";

/// The Unicode Replacement character `U+FFFD` (�).
const int unicodeReplacementCharacterRune = 0xFFFD;

/// The Unicode Byte Order Marker (BOM) character `U+FEFF`.
const int unicodeBomCharacterRune = 0xFEFF;

/// An instance of the default implementation of the [Utf8Codec].
///
/// This instance provides a convenient access to the most common UTF-8
/// use cases.
///
/// Examples:
/// ```dart
/// var encoded = utf8.encode("Îñţérñåţîöñåļîžåţîờñ");
/// var decoded = utf8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,
///                            0x72, 0x67, 0x72, 0xc3, 0xb8, 0x64]);
/// ```
const Utf8Codec utf8 = Utf8Codec();

/// A [Utf8Codec] encodes strings to utf-8 code units (bytes) and decodes
/// UTF-8 code units to strings.
final class Utf8Codec extends Encoding {
  /// Default malformed-input policy used by [decode] and [decoder].
  final bool _allowMalformed;

  /// Instantiates a new [Utf8Codec].
  ///
  /// The optional [allowMalformed] argument defines how [decoder] (and [decode])
  /// deal with invalid or unterminated character sequences.
  ///
  /// If it is `true` (and not overridden at the method invocation) [decode] and
  /// the [decoder] replace invalid (or unterminated) octet
  /// sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
  /// they throw a [FormatException].
  const Utf8Codec({bool allowMalformed = false})
    : _allowMalformed = allowMalformed;

  /// The name of this codec is "utf-8".
  String get name => "utf-8";

  /// Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
  /// corresponding string.
  ///
  /// If the [codeUnits] start with the encoding of a
  /// [unicodeBomCharacterRune], that character is discarded.
  ///
  /// If [allowMalformed] is `true`, the decoder replaces invalid (or
  /// unterminated) character sequences with the Unicode Replacement character
  /// `U+FFFD` (�). Otherwise it throws a [FormatException].
  ///
  /// If [allowMalformed] is not given, it defaults to the `allowMalformed` that
  /// was used to instantiate `this`.
  String decode(List<int> codeUnits, {bool? allowMalformed}) {
    // Switch between const objects to avoid allocation.
    Utf8Decoder decoder = allowMalformed ?? _allowMalformed
        ? const Utf8Decoder(allowMalformed: true)
        : const Utf8Decoder(allowMalformed: false);
    return decoder.convert(codeUnits);
  }

  /// Encodes the [string] as UTF-8.
  Uint8List encode(String string) {
    return const Utf8Encoder().convert(string);
  }

  /// The constant [Utf8Encoder] used by this codec.
  Utf8Encoder get encoder => const Utf8Encoder();

  /// A constant [Utf8Decoder] whose malformed-input policy matches the
  /// `allowMalformed` value this codec was created with.
  Utf8Decoder get decoder {
    // Switch between const objects to avoid allocation.
    return _allowMalformed
        ? const Utf8Decoder(allowMalformed: true)
        : const Utf8Decoder(allowMalformed: false);
  }
}

/// This class converts strings to their UTF-8 code units (a list of
/// unsigned 8-bit integers).
///
/// Example:
/// ```dart
/// final utf8Encoder = utf8.encoder;
/// const sample = 'Îñţérñåţîöñåļîžåţîờñ';
/// final encodedSample = utf8Encoder.convert(sample);
/// print(encodedSample);
/// ```
final class Utf8Encoder extends Converter<String, List<int>> {
  const Utf8Encoder();

  /// Converts [string] to its UTF-8 code units (a list of
  /// unsigned 8-bit integers).
  ///
  /// If [start] and [end] are provided, only the substring
  /// `string.substring(start, end)` is converted.
  ///
  /// Any unpaired surrogate character (`U+D800`-`U+DFFF`) in the input string
  /// is encoded as a Unicode Replacement character `U+FFFD` (�).
  Uint8List convert(String string, [int start = 0, int? end]) {
    var stringLength = string.length;
    end = RangeError.checkValidRange(start, end, stringLength);
    var length = end - start;
    if (length == 0) return Uint8List(0);
    // Create a new encoder with a length that is guaranteed to be big enough.
    // A single code unit uses at most 3 bytes, a surrogate pair at most 4.
    var encoder = _Utf8Encoder.withBufferSize(length * 3);
    var endPosition = encoder._fillBuffer(string, start, end);
    assert(endPosition >= end - 1);
    if (endPosition != end) {
      // Encoding skipped the last code unit.
      // That can only happen if the last code unit is a lead surrogate.
      // It has no partner in the converted range, so it is unpaired.
      var lastCodeUnit = string.codeUnitAt(end - 1);
      assert(_isLeadSurrogate(lastCodeUnit));
      // Write a replacement character to represent the unpaired surrogate.
      encoder._writeReplacementCharacter();
    }
    // Copy out only the bytes that were actually written.
    return encoder._buffer.sublist(0, encoder._bufferIndex);
  }

  /// Starts a chunked conversion.
  ///
  /// The converter works more efficiently if the given [sink] is a
  /// [ByteConversionSink].
  StringConversionSink startChunkedConversion(Sink<List<int>> sink) {
    return _Utf8EncoderSink(
      sink is ByteConversionSink ? sink : ByteConversionSink.from(sink),
    );
  }

  // Override the base-classes bind, to provide a better type.
  Stream<List<int>> bind(Stream<String> stream) => super.bind(stream);
}

/// This class encodes Strings to UTF-8 code units (unsigned 8 bit integers).
// TODO(floitsch): make this class public.
class _Utf8Encoder {
  /// A lead surrogate held over from a previous chunk, or 0 when there is none.
  int _carry = 0;

  /// Index of the next free position in [_buffer].
  int _bufferIndex = 0;

  /// Output buffer that the `_write*` and [_fillBuffer] methods append to.
  final Uint8List _buffer;

  static const _DEFAULT_BYTE_BUFFER_SIZE = 1024;

  _Utf8Encoder() : this.withBufferSize(_DEFAULT_BYTE_BUFFER_SIZE);

  _Utf8Encoder.withBufferSize(int bufferSize)
    : _buffer = _createBuffer(bufferSize);

  /// Allow an implementation to pick the most efficient way of storing bytes.
  static Uint8List _createBuffer(int size) => Uint8List(size);

  /// Write a replacement character (U+FFFD). Used for unpaired surrogates.
  ///
  /// Appends the three bytes `EF BF BD` without checking for free space; the
  /// caller must ensure the buffer has room.
  void _writeReplacementCharacter() {
    _buffer[_bufferIndex++] = 0xEF;
    _buffer[_bufferIndex++] = 0xBF;
    _buffer[_bufferIndex++] = 0xBD;
  }

  /// Tries to combine the given [leadingSurrogate] with the [nextCodeUnit] and
  /// writes it to [_buffer].
  ///
  /// Returns true if the [nextCodeUnit] was combined with the
  /// [leadingSurrogate]. If it wasn't, then nextCodeUnit was not a trailing
  /// surrogate and has not been written yet.
  ///
  /// It is safe to pass 0 for [nextCodeUnit], in which case a replacement
  /// character is written to represent the unpaired lead surrogate.
  bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) {
    if (_isTailSurrogate(nextCodeUnit)) {
      var rune = _combineSurrogatePair(leadingSurrogate, nextCodeUnit);
      // If the rune is encoded with 2 code-units then it must be encoded
      // with 4 bytes in UTF-8.
      assert(rune > _THREE_BYTE_LIMIT);
      assert(rune <= _FOUR_BYTE_LIMIT);
      _buffer[_bufferIndex++] = 0xF0 | (rune >> 18);
      _buffer[_bufferIndex++] = 0x80 | ((rune >> 12) & 0x3f);
      _buffer[_bufferIndex++] = 0x80 | ((rune >> 6) & 0x3f);
      _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f);
      return true;
    } else {
      // Unpaired lead surrogate.
      _writeReplacementCharacter();
      return false;
    }
  }

  /// Fills the [_buffer] with as many characters as possible.
  ///
  /// Does not encode any trailing lead-surrogate. This must be done by the
  /// caller.
  ///
  /// Encoding stops early when the buffer has no room for the next character.
  ///
  /// Returns the position in the string. The returned index points to the
  /// first code unit that hasn't been encoded.
  int _fillBuffer(String str, int start, int end) {
    if (start != end && _isLeadSurrogate(str.codeUnitAt(end - 1))) {
      // Don't handle a trailing lead-surrogate in this loop. The caller has
      // to deal with those.
      end--;
    }
    int stringIndex;
    for (stringIndex = start; stringIndex < end; stringIndex++) {
      var codeUnit = str.codeUnitAt(stringIndex);
      // ASCII has the same representation in UTF-8 and UTF-16.
      if (codeUnit <= _ONE_BYTE_LIMIT) {
        if (_bufferIndex >= _buffer.length) break;
        _buffer[_bufferIndex++] = codeUnit;
      } else if (_isLeadSurrogate(codeUnit)) {
        if (_bufferIndex + 4 > _buffer.length) break;
        // Note that it is safe to read the next code unit. We decremented
        // [end] above when the last valid code unit was a leading surrogate.
        var nextCodeUnit = str.codeUnitAt(stringIndex + 1);
        var wasCombined = _writeSurrogate(codeUnit, nextCodeUnit);
        // A combined pair consumed two code units; skip the tail surrogate.
        if (wasCombined) stringIndex++;
      } else if (_isTailSurrogate(codeUnit)) {
        if (_bufferIndex + 3 > _buffer.length) break;
        // Unpaired tail surrogate.
        _writeReplacementCharacter();
      } else {
        var rune = codeUnit;
        if (rune <= _TWO_BYTE_LIMIT) {
          // Two-byte sequence: 110xxxxx 10xxxxxx.
          if (_bufferIndex + 1 >= _buffer.length) break;
          _buffer[_bufferIndex++] = 0xC0 | (rune >> 6);
          _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f);
        } else {
          // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
          assert(rune <= _THREE_BYTE_LIMIT);
          if (_bufferIndex + 2 >= _buffer.length) break;
          _buffer[_bufferIndex++] = 0xE0 | (rune >> 12);
          _buffer[_bufferIndex++] = 0x80 | ((rune >> 6) & 0x3f);
          _buffer[_bufferIndex++] = 0x80 | (rune & 0x3f);
        }
      }
    }
    return stringIndex;
  }
}

/// This class encodes chunked strings to UTF-8 code units (unsigned 8-bit
/// integers).
class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSink {
  /// Destination that receives the encoded bytes.
  final ByteConversionSink _sink;

  _Utf8EncoderSink(this._sink);

  /// Closes the underlying sink.
  ///
  /// If a lead surrogate is still pending in [_carry], it is first flushed (as
  /// a replacement character) through [addSlice], which then calls [close]
  /// again with the carry cleared.
  void close() {
    if (_carry != 0) {
      // addSlice will call close again, but then the carry must be equal to 0.
      addSlice("", 0, 0, true);
      return;
    }
    _sink.close();
  }

  /// Encodes the code units of [str] from [start] to [end] and forwards the
  /// resulting bytes to the underlying sink, one buffer-full at a time.
  ///
  /// A lead surrogate at the very end of the slice is kept in [_carry] so it
  /// can be combined with the first code unit of the next slice. If [isLast]
  /// is true, [close] is called after the slice has been processed.
  void addSlice(String str, int start, int end, bool isLast) {
    _bufferIndex = 0;

    if (start == end && !isLast) {
      return;
    }

    if (_carry != 0) {
      // Resolve the lead surrogate left over from the previous slice.
      var nextCodeUnit = 0;
      if (start != end) {
        nextCodeUnit = str.codeUnitAt(start);
      } else {
        assert(isLast);
      }
      var wasCombined = _writeSurrogate(_carry, nextCodeUnit);
      // Either we got a non-empty string, or we must not have been combined.
      assert(!wasCombined || start != end);
      if (wasCombined) start++;
      _carry = 0;
    }
    do {
      start = _fillBuffer(str, start, end);
      var isLastSlice = isLast && (start == end);
      if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) {
        if (isLast && _bufferIndex < _buffer.length - 3) {
          // There is still space for the replacement character to represent
          // the last incomplete surrogate.
          _writeReplacementCharacter();
        } else {
          // Otherwise store it in the carry. If isLast is true, then
          // close will flush the last carry.
          _carry = str.codeUnitAt(start);
        }
        start++;
      }
      _sink.addSlice(_buffer, 0, _bufferIndex, isLastSlice);
      // The buffer content has been handed off; reuse it for the next round.
      _bufferIndex = 0;
    } while (start < end);
    if (isLast) close();
  }

  // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it
  // needs to deal with malformed input.
}

/// This class converts UTF-8 code units (lists of unsigned 8-bit integers)
/// to a string.
///
/// Example:
/// ```dart
/// final utf8Decoder = utf8.decoder;
/// const encodedBytes = [
///   195, 142, 195, 177, 197, 163, 195, 169, 114, 195, 177, 195, 165, 197,
///   163, 195, 174, 195, 182, 195, 177, 195, 165, 196, 188, 195, 174, 197,
///   190, 195, 165, 197, 163, 195, 174, 225, 187, 157, 195, 177];
///
/// final decodedBytes = utf8Decoder.convert(encodedBytes);
/// print(decodedBytes); // Îñţérñåţîöñåļîžåţîờñ
/// ```
/// Throws a [FormatException] if the encoded input contains
/// invalid UTF-8 byte sequences and [allowMalformed] is `false` (the default).
///
/// If [allowMalformed] is `true`, invalid byte sequences are converted into
/// one or more Unicode replacement characters, U+FFFD ('�').
///
/// Example with `allowMalformed` set to true:
/// ```dart
/// const utf8Decoder = Utf8Decoder(allowMalformed: true);
/// const encodedBytes = [0xFF];
/// final decodedBytes = utf8Decoder.convert(encodedBytes);
/// print(decodedBytes); // �
/// ```
final class Utf8Decoder extends Converter<List<int>, String> {
  /// Whether malformed input is replaced (true) or reported (false).
  final bool _allowMalformed;

  /// Instantiates a new [Utf8Decoder].
  ///
  /// The optional [allowMalformed] argument defines how [convert] deals
  /// with invalid or unterminated character sequences.
  ///
  /// If it is `true`, [convert] replaces invalid (or unterminated) character
  /// sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
  /// it throws a [FormatException].
  const Utf8Decoder({bool allowMalformed = false})
    : _allowMalformed = allowMalformed;

  /// Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
  /// corresponding string.
  ///
  /// Uses the code units from [start] to, but not including, [end].
  /// If [end] is omitted, it defaults to `codeUnits.length`.
  ///
  /// If the [codeUnits] start with the encoding of a
  /// [unicodeBomCharacterRune], that character is discarded.
  String convert(List<int> codeUnits, [int start = 0, int? end]) =>
      _Utf8Decoder(_allowMalformed).convertSingle(codeUnits, start, end);

  /// Starts a chunked conversion.
  ///
  /// The converter works more efficiently if the given [sink] is a
  /// [StringConversionSink].
  ByteConversionSink startChunkedConversion(Sink<String> sink) {
    StringConversionSink stringSink;
    if (sink is StringConversionSink) {
      stringSink = sink;
    } else {
      stringSink = StringConversionSink.from(sink);
    }
    return stringSink.asUtf8Sink(_allowMalformed);
  }

  // Override the base-classes bind, to provide a better type.
  Stream<String> bind(Stream<List<int>> stream) => super.bind(stream);

  external Converter<List<int>, T> fuse<T>(Converter<String, T> next);
}

// UTF-8 constants.
const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits
const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits
const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.

// UTF-16 constants.
const int _SURROGATE_TAG_MASK = 0xFC00;
const int _SURROGATE_VALUE_MASK = 0x3FF;
const int _LEAD_SURROGATE_MIN = 0xD800;
const int _TAIL_SURROGATE_MIN = 0xDC00;

/// Whether [codeUnit] is a UTF-16 lead (high) surrogate, `0xD800`-`0xDBFF`.
bool _isLeadSurrogate(int codeUnit) =>
    (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;

/// Whether [codeUnit] is a UTF-16 tail (low) surrogate, `0xDC00`-`0xDFFF`.
bool _isTailSurrogate(int codeUnit) =>
    (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;

/// Combines a [lead] and [tail] surrogate into the code point they encode.
///
/// Note that `+` binds tighter than `|`; the result is still correct because
/// the low ten bits of the left operand are always zero.
int _combineSurrogatePair(int lead, int tail) =>
    0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) |
    (tail & _SURROGATE_VALUE_MASK);

class _Utf8Decoder {
  /// Decode malformed UTF-8 as replacement characters (instead of throwing)?
  final bool allowMalformed;

  /// Decoder DFA state.
  int _state;

  /// Partially decoded character. Meaning depends on state. Not used when in
  /// the initial/accept state. When in an error state, contains the index into
  /// the input of the error.
  int _charOrIndex = 0;

  // State machine for UTF-8 decoding, based on this decoder by Björn Höhrmann:
  // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  //
  // One iteration in the state machine proceeds as:
  //
  // type = typeTable[byte];
  // char = (state != accept)
  //     ? (byte & 0x3F) | (char << 6)
  //     : byte & (shiftedByteMask >> type);
  // state = transitionTable[state + type];
  //
  // After each iteration, if state == accept, char is output as a character.

  // Mask to and on the type read from the table.
  static const int typeMask = 0x1F;
  // Mask shifted right by byte type to mask first byte of sequence.
  static const int shiftedByteMask = 0xF0FE;

  // Byte types.
  // 'A' = ASCII, 00-7F
  // 'B' = 2-byte, C2-DF
  // 'C' = 3-byte, E1-EC, EE
  // 'D' = 3-byte (possibly surrogate), ED
  // 'E' = Illegal, C0-C1, F5+
  // 'F' = Low extension, 80-8F
  // 'G' = Mid extension, 90-9F
  // 'H' = High extension, A0-BA, BC-BE
  // 'I' = Second byte of BOM, BB
  // 'J' = Third byte of BOM, BF
  // 'K' = 3-byte (possibly overlong), E0
  // 'L' = First byte of BOM, EF
  // 'M' = 4-byte (possibly out-of-range), F4
  // 'N' = 4-byte, F1-F3
  // 'O' = 4-byte (possibly overlong), F0
  static const String typeTable =
      ""
      "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 00-1F
      "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 20-3F
      "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 40-5F
      "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" // 60-7F
      "FFFFFFFFFFFFFFFFGGGGGGGGGGGGGGGG" // 80-9F
      "HHHHHHHHHHHHHHHHHHHHHHHHHHHIHHHJ" // A0-BF
      "EEBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB" // C0-DF
      "KCCCCCCCCCCCCDCLONNNMEEEEEEEEEEE" // E0-FF
      ;

  // States (offsets into transition table).
  static const int IA = 0x00; // Initial / Accept
  static const int BB = 0x10; // Before BOM
  static const int AB = 0x20; // After BOM
  static const int X1 = 0x30; // Expecting one extension byte
  static const int X2 = 0x3A; // Expecting two extension bytes
  static const int X3 = 0x44; // Expecting three extension bytes
  static const int TO = 0x4E; // Possibly overlong 3-byte
  static const int TS = 0x58; // Possibly surrogate
  static const int QO = 0x62; // Possibly overlong 4-byte
  static const int QR = 0x6C; // Possibly out-of-range 4-byte
  static const int B1 = 0x76; // One byte into BOM
  static const int B2 = 0x80; // Two bytes into BOM
  static const int E1 = 0x41; // Error: Missing extension byte
  static const int E2 = 0x43; // Error: Unexpected extension byte
  static const int E3 = 0x45; // Error: Invalid byte
  static const int E4 = 0x47; // Error: Overlong encoding
  static const int E5 = 0x49; // Error: Out of range
  static const int E6 = 0x4B; // Error: Surrogate
  static const int E7 = 0x4D; // Error: Unfinished

  // Character equivalents for states.
  static const String _IA = '\u0000';
  static const String _BB = '\u0010';
  static const String _AB = '\u0020';
  static const String _X1 = '\u0030';
  static const String _X2 = '\u003A';
  static const String _X3 = '\u0044';
  static const String _TO = '\u004E';
  static const String _TS = '\u0058';
  static const String _QO = '\u0062';
  static const String _QR = '\u006C';
  static const String _B1 = '\u0076';
  static const String _B2 = '\u0080';
  static const String _E1 = '\u0041';
  static const String _E2 = '\u0043';
  static const String _E3 = '\u0045';
  static const String _E4 = '\u0047';
  static const String _E5 = '\u0049';
  static const String _E6 = '\u004B';
  static const String _E7 = '\u004D';

  // Transition table of the state machine. Maps state and byte type
  // to next state.
  //
  // Byte types are 1-based ('A' & typeMask == 1), so the single-space
  // fragments below are padding at type offset 0 and must be kept as-is.
  static const String transitionTable =
      " "
      // A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
      "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_X2$_QR$_X3$_QO " // IA
      "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_B1$_QR$_X3$_QO " // BB
      "$_IA$_X1$_X2$_TS$_E3$_E2$_E2$_E2$_E2$_E2$_TO$_X2$_QR$_X3$_QO " // AB
      "$_E1$_E1$_E1$_E1$_E1$_IA$_IA$_IA$_IA$_IA" // Overlap 5 E1s        X1
      "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_X1$_X1$_X1" // Overlap 5 E1s        X2
      "$_E1$_E1$_E1$_E1$_E1$_X2$_X2$_X2$_X2$_X2" // Overlap 5 E1s        X3
      "$_E1$_E1$_E1$_E1$_E1$_E4$_E4$_X1$_X1$_X1" // Overlap 5 E1s        TO
      "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_E6$_E6$_E6" // Overlap 5 E1s        TS
      "$_E1$_E1$_E1$_E1$_E1$_E4$_X2$_X2$_X2$_X2" // Overlap 5 E1s        QO
      "$_E1$_E1$_E1$_E1$_E1$_X2$_E5$_E5$_E5$_E5" // Overlap 5 E1s        QR
      "$_E1$_E1$_E1$_E1$_E1$_X1$_X1$_X1$_B2$_X1" // Overlap 5 E1s        B1
      "$_E1$_E1$_E1$_E1$_E1$_IA$_IA$_IA$_IA$_AB$_E1$_E1$_E1$_E1$_E1" // B2
      ;

  // Aliases for states.
  static const int initial = IA;
  static const int accept = IA;
  static const int beforeBom = BB;
  static const int afterBom = AB;
  static const int errorMissingExtension = E1;
  static const int errorUnexpectedExtension = E2;
  static const int errorInvalid = E3;
  static const int errorOverlong = E4;
  static const int errorOutOfRange = E5;
  static const int errorSurrogate = E6;
  static const int errorUnfinished = E7;

  /// Whether [state] is one of the error states.
  ///
  /// All error states have odd values; all other states are even.
  @pragma("vm:prefer-inline")
  @pragma("wasm:prefer-inline")
  static bool isErrorState(int state) => (state & 1) != 0;

  /// Returns a human-readable description of the error [state], or the empty
  /// string if [state] is not a known error state.
  static String errorDescription(int state) {
    switch (state) {
      case errorMissingExtension:
        return "Missing extension byte";
      case errorUnexpectedExtension:
        return "Unexpected extension byte";
      case errorInvalid:
        return "Invalid UTF-8 byte";
      case errorOverlong:
        return "Overlong encoding";
      case errorOutOfRange:
        return "Out of unicode range";
      case errorSurrogate:
        return "Encoded surrogate";
      case errorUnfinished:
        return "Unfinished UTF-8 octet sequence";
      default:
        return "";
    }
  }

  external _Utf8Decoder(bool allowMalformed);

  external String convertSingle(List<int> codeUnits, int start, int? maybeEnd);

  external String convertChunked(List<int> codeUnits, int start, int? maybeEnd);

  /// Whether the decoder expects the continuation of unfinished multi-byte
  /// UTF8-encoding.
  bool get expectsContinuation => _state > afterBom;

  /// Flushes this decoder as if closed.
  ///
  /// This method throws if the input was partial and the decoder was
  /// constructed with `allowMalformed` set to `false`.
  void flush(StringSink sink) {
    final int state = _state;
    _state = initial;
    if (state <= afterBom) {
      return;
    }
    // Unfinished sequence.
    if (allowMalformed) {
      sink.writeCharCode(unicodeReplacementCharacterRune);
    } else {
      throw FormatException(errorDescription(errorUnfinished), null, null);
    }
  }

  /// Decodes [bytes] from [start] to [end] with the DFA, continuing from the
  /// state saved in [_state] and [_charOrIndex].
  ///
  /// The byte at [start] is read unconditionally, so the range must not be
  /// empty.
  ///
  /// If malformed input is found and [allowMalformed] is false, the error
  /// state is stored in [_state], the index of the offending byte in
  /// [_charOrIndex], and the empty string is returned; reporting the error is
  /// left to the caller.
  ///
  /// If [single] is true, input that ends in the middle of a multi-byte
  /// sequence is treated as unfinished: it either yields a replacement
  /// character or records [errorUnfinished] at index [end].
  String decodeGeneral(Uint8List bytes, int start, int end, bool single) {
    final String typeTable = _Utf8Decoder.typeTable;
    final String transitionTable = _Utf8Decoder.transitionTable;
    int state = _state;
    int char = _charOrIndex;
    final StringBuffer buffer = StringBuffer();
    int i = start;
    int byte = bytes[i++];
    loop:
    while (true) {
      // Run the DFA until a complete character has been accepted.
      multibyte:
      while (true) {
        int type = typeTable.codeUnitAt(byte) & typeMask;
        char = (state <= afterBom)
            ? byte & (shiftedByteMask >> type)
            : (byte & 0x3F) | (char << 6);
        state = transitionTable.codeUnitAt(state + type);
        if (state == accept) {
          buffer.writeCharCode(char);
          if (i == end) break loop;
          break multibyte;
        } else if (isErrorState(state)) {
          if (allowMalformed) {
            switch (state) {
              case errorInvalid:
              case errorUnexpectedExtension:
                // A single byte that can't start a sequence.
                buffer.writeCharCode(unicodeReplacementCharacterRune);
                break;
              case errorMissingExtension:
                // Unfinished sequence followed by a byte that can start a
                // sequence.
                buffer.writeCharCode(unicodeReplacementCharacterRune);
                // Re-parse offending byte.
                i -= 1;
                break;
              default:
                // Unfinished sequence followed by a byte that can't start a
                // sequence.
                buffer.writeCharCode(unicodeReplacementCharacterRune);
                buffer.writeCharCode(unicodeReplacementCharacterRune);
                break;
            }
            state = initial;
          } else {
            _state = state;
            _charOrIndex = i - 1;
            return "";
          }
        }
        if (i == end) break loop;
        byte = bytes[i++];
      }

      // Fast path: after an accepted character, consume a run of ASCII bytes
      // without going through the DFA.
      final int markStart = i;
      byte = bytes[i++];
      if (byte < 128) {
        int markEnd = end;
        while (i < end) {
          byte = bytes[i++];
          if (byte >= 128) {
            markEnd = i - 1;
            break;
          }
        }
        assert(markStart < markEnd);
        // Short runs are written one character at a time; longer runs are
        // converted with a single String.fromCharCodes call.
        if (markEnd - markStart < 20) {
          for (int m = markStart; m < markEnd; m++) {
            buffer.writeCharCode(bytes[m]);
          }
        } else {
          buffer.write(String.fromCharCodes(bytes, markStart, markEnd));
        }
        if (markEnd == end) break loop;
      }
    }

    if (single && state > afterBom) {
      // Unfinished sequence.
      if (allowMalformed) {
        buffer.writeCharCode(unicodeReplacementCharacterRune);
      } else {
        _state = errorUnfinished;
        _charOrIndex = end;
        return "";
      }
    }
    _state = state;
    _charOrIndex = char;
    return buffer.toString();
  }

  /// Copies `codeUnits` from [start] to [end] into a new [Uint8List].
  ///
  /// Values that do not fit in an unsigned byte are replaced by `0xFF`, which
  /// is itself an invalid UTF-8 byte, so they still decode as errors.
  static Uint8List _makeUint8List(List<int> codeUnits, int start, int end) {
    final int length = end - start;
    final Uint8List bytes = Uint8List(length);
    for (int i = 0; i < length; i++) {
      int b = codeUnits[start + i];
      if ((b & ~0xFF) != 0) {
        // Replace invalid byte values by FF, which is also invalid.
        b = 0xFF;
      }
      bytes[i] = b;
    }
    return bytes;
  }
}