LCOV - lcov.info - src/charset

LCOV - code coverage report

Current view:	top level - src - charset_detector.dart (source / functions)		Coverage	Total	Hit
Test:	lcov.info	Lines:	100.0 %	46	46
Test Date:	2026-06-15 22:56:39	Functions:	-	0	0
Legend:	Lines: hit not hit

            Line data    Source code

       1              : // Copyright 2026 The Authors.
       2              : //
       3              : // Licensed under the Apache License, Version 2.0 (the "License");
       4              : // you may not use this file except in compliance with the License.
       5              : // You may obtain a copy of the License at
       6              : //
       7              : //     https://www.apache.org/licenses/LICENSE-2.0
       8              : //
       9              : // Unless required by applicable law or agreed to in writing, software
      10              : // distributed under the License is distributed on an "AS IS" BASIS,
      11              : // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      12              : // See the License for the specific language governing permissions and
      13              : // limitations under the License.
      14              : 
      15              : import 'dart:convert';
      16              : import 'dart:typed_data';
      17              : 
      18              : import 'package:charset/charset.dart';
      19              : 
      20              : /// Maximum number of bytes sampled from the input for detection.
      21              : ///
      22              : /// Only the first [_sampleSize] bytes are examined. This bounds the decoding
      23              : /// work done on large inputs and is sufficient for reliable BOM and structural
      24              : /// detection. The sample boundary is a hard byte cut — it is not aligned to
      25              : /// a character boundary. If a multi-byte sequence spans the boundary, Stage 2
      26              : /// (UTF-8 structural validation) may reject an otherwise valid UTF-8 file and
      27              : /// fall through to the Stage 3 probe. This is accepted as a documented
      28              : /// trade-off: the edge case is vanishingly rare and the consequence (a
      29              : /// `windows-1252` result instead of `utf-8`) is mild.
      30              : const int _sampleSize = 8 * 1024; // 8 KB
      31              : 
      32              : /// Explicit mapping from each candidate [Encoding] instance to its canonical
      33              : /// lowercase IANA label.
      34              : ///
      35              : /// The detector always returns labels from this map — never [Encoding.name] —
      36              : /// because several codecs use non-hyphenated or otherwise non-IANA internal
      37              : /// names (e.g. `windows1252`, `latin-2`, `latin-9`). Owning this map makes
      38              : /// the public API contract stable and independent of any upstream name changes
      39              : /// in the `charset` package.
      40            3 : final Map<Encoding, String> _ianaLabels = {
      41            1 :   windows1252: 'windows-1252',
      42              :   latin1: 'iso-8859-1',
      43            1 :   latin2: 'iso-8859-2',
      44            1 :   latin9: 'iso-8859-15',
      45              :   shiftJis: 'shift-jis',
      46              :   eucJp: 'euc-jp',
      47              :   eucKr: 'euc-kr',
      48              :   gbk: 'gbk',
      49              : };
      50              : 
      51              : /// Candidate encodings to probe in Stage 3, partitioned into Western and CJK
      52              : /// groups. The probe order within each group is significant: the first
      53              : /// candidate that successfully decodes the sample wins. CJK candidates are
      54              : /// promoted to the front of the list when the sample contains a high
      55              : /// proportion of high bytes (see [_looksMultibyte]).
      56              : ///
      57              : /// Ordering rationale for Western candidates:
      58              : /// - `windows-1252` is first because it is the most common legacy Western
      59              : ///   encoding and accepts almost any byte sequence. Placing it first means
      60              : ///   that if no CJK encoding matches, `windows-1252` wins immediately;
      61              : ///   ISO-8859 variants follow to give them a chance before the fallback.
      62              : /// - `iso-8859-1` is a strict subset of `windows-1252`, so it matches whenever
      63              : ///   `windows-1252` would and can only win when that probe fails; it is
      64              : ///   listed for completeness and to exercise the probe path in tests.
      65              : /// - `iso-8859-2` and `iso-8859-15` are more restrictive and can reject
      66              : ///   byte sequences that `windows-1252` accepts.
      67            3 : final List<Encoding> _westernCandidates = [
      68            1 :   windows1252,
      69              :   latin1, // iso-8859-1
      70            1 :   latin2, // iso-8859-2
      71            1 :   latin9, // iso-8859-15
      72              : ];
      73              : 
      74            3 : final List<Encoding> _cjkCandidates = [shiftJis, eucJp, eucKr, gbk];
      75              : 
      76              : /// Detects the character encoding of the given byte sequence.
      77              : ///
      78              : /// Returns a lowercase IANA encoding label string. The following labels may
      79              : /// be returned:
      80              : ///
      81              : /// - BOM-detected: `'utf-8'`, `'utf-16be'`, `'utf-16le'`, `'utf-32be'`,
      82              : ///   `'utf-32le'`
      83              : /// - UTF-8 structural: `'utf-8'`
      84              : /// - Legacy 8-bit: `'windows-1252'`, `'iso-8859-1'`, `'iso-8859-2'`,
      85              : ///   `'iso-8859-15'`
      86              : /// - CJK multi-byte: `'shift-jis'`, `'euc-jp'`, `'euc-kr'`, `'gbk'`
      87              : /// - Fallback: `'windows-1252'`
      88              : ///
      89              : /// Detection proceeds through three ordered stages, falling through only
      90              : /// when the current stage cannot make a determination:
      91              : ///
      92              : /// **Stage 1 — BOM inspection (deterministic)**
      93              : /// A byte-order mark (BOM), when present, is authoritative. The four-byte
      94              : /// UTF-32 BOMs are checked before the two-byte UTF-16 BOMs to prevent
      95              : /// UTF-32 LE from being misidentified as UTF-16 LE (they share the same
      96              : /// first two bytes: `FF FE`).
      97              : ///
      98              : /// **Stage 2 — UTF-8 structural validation**
      99              : /// A leading 8 KB sample of the input is decoded with
     100              : /// `utf8.decode(allowMalformed: false)`. A successful decode means the
     101              : /// content is UTF-8 (or pure ASCII, which is a strict UTF-8 subset). Empty
     102              : /// input passes this stage and returns `'utf-8'`.
     103              : ///
     104              : /// **Stage 3 — Candidate probe via the `charset` package**
     105              : /// The sample is tested against each candidate [Encoding] using the static
     106              : /// [Charset.canDecode] method. CJK encodings are promoted when more than 15%
     107              : /// of sample bytes are ≥ `0x80`. The first candidate to successfully decode
     108              : /// the sample wins. If no candidate matches, `'windows-1252'` is returned as
     109              : /// the fallback (following the WHATWG Encoding specification default).
     110              : ///
     111              : /// Note: [Charset.canDecode] considers a decode invalid if the resulting
     112              : /// string contains the Unicode replacement character U+FFFD (`'\uFFFD'`).
     113              : /// Input that legitimately contains U+FFFD will therefore be rejected by
     114              : /// every non-UTF codec regardless of its actual encoding. This is a known
     115              : /// limitation of the structural validity approach.
     116              : ///
     117              : /// Example (web-safe — works on all platforms):
     118              : /// ```dart
     119              : /// import 'dart:typed_data';
     120              : /// import 'package:betto_charset_detector/betto_charset_detector.dart';
     121              : ///
     122              : /// void main() {
     123              : ///   // Bytes may come from an HTTP response, file picker, dart:io, etc.
     124              : ///   final bytes = Uint8List.fromList([0xEF, 0xBB, 0xBF, 104, 101, 108, 108, 111]);
     125              : ///   final encoding = detectCharset(bytes);
     126              : ///   print('Detected encoding: $encoding'); // utf-8
     127              : /// }
     128              : /// ```
     129              : ///
     130              : /// On native platforms only, you can read bytes from a file:
     131              : /// ```dart
     132              : /// import 'dart:io';
     133              : /// import 'package:betto_charset_detector/betto_charset_detector.dart';
     134              : ///
     135              : /// void main() {
     136              : ///   final bytes = File('data.csv').readAsBytesSync();
     137              : ///   final encoding = detectCharset(bytes);
     138              : ///   print('Detected encoding: $encoding');
     139              : /// }
     140              : /// ```
     141            1 : String detectCharset(Uint8List bytes) {
     142              :   // Extract a leading sample to limit memory usage.
     143              :   // The sample cap is applied first so that all subsequent stages operate
     144              :   // on the same bounded input.
     145            2 :   final sample = bytes.length > _sampleSize
     146            1 :       ? Uint8List.sublistView(bytes, 0, _sampleSize)
     147              :       : bytes;
     148              : 
     149              :   // Stage 1: BOM inspection.
     150            1 :   final bomResult = _detectBom(sample);
     151              :   if (bomResult != null) {
     152              :     return bomResult;
     153              :   }
     154              : 
     155              :   // Stage 2: UTF-8 structural validation.
     156            1 :   if (_isValidUtf8(sample)) {
     157              :     return 'utf-8';
     158              :   }
     159              : 
     160              :   // Stage 3: Candidate probe.
     161            1 :   return _probeEncoding(sample);
     162              : }
     163              : 
     164              : /// Returns the IANA encoding label indicated by a byte-order mark at the
     165              : /// start of [bytes], or `null` if no recognised BOM is present.
     166              : ///
     167              : /// Four-byte BOMs are checked before two-byte BOMs. This ordering is
     168              : /// essential to correctly distinguish UTF-32 LE (`FF FE 00 00`) from UTF-16
     169              : /// LE (`FF FE`), as the two sequences share the same first two bytes.
     170              : ///
     171              : /// | BOM bytes       | Returned label |
     172              : /// | :-------------- | :------------- |
     173              : /// | `00 00 FE FF`   | `'utf-32be'`   |
     174              : /// | `FF FE 00 00`   | `'utf-32le'`   |
     175              : /// | `EF BB BF`      | `'utf-8'`      |
     176              : /// | `FE FF`         | `'utf-16be'`   |
     177              : /// | `FF FE`         | `'utf-16le'`   |
     178            1 : String? _detectBom(Uint8List bytes) {
     179            1 :   final len = bytes.length;
     180              : 
     181              :   // Check 4-byte BOMs first to avoid misidentifying UTF-32 as UTF-16.
     182            1 :   if (len >= 4) {
     183              :     // UTF-32 BE BOM: 00 00 FE FF
     184            2 :     if (bytes[0] == 0x00 &&
     185            2 :         bytes[1] == 0x00 &&
     186            2 :         bytes[2] == 0xFE &&
     187            2 :         bytes[3] == 0xFF) {
     188              :       return 'utf-32be';
     189              :     }
     190              :     // UTF-32 LE BOM: FF FE 00 00
     191            2 :     if (bytes[0] == 0xFF &&
     192            2 :         bytes[1] == 0xFE &&
     193            2 :         bytes[2] == 0x00 &&
     194            2 :         bytes[3] == 0x00) {
     195              :       return 'utf-32le';
     196              :     }
     197              :   }
     198              : 
     199              :   // Check 3-byte BOMs.
     200            1 :   if (len >= 3) {
     201              :     // UTF-8 BOM: EF BB BF
     202            6 :     if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
     203              :       return 'utf-8';
     204              :     }
     205              :   }
     206              : 
     207              :   // Check 2-byte BOMs.
     208            1 :   if (len >= 2) {
     209              :     // UTF-16 BE BOM: FE FF
     210            4 :     if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
     211              :       return 'utf-16be';
     212              :     }
     213              :     // UTF-16 LE BOM: FF FE
     214            4 :     if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
     215              :       return 'utf-16le';
     216              :     }
     217              :   }
     218              : 
     219              :   return null;
     220              : }
     221              : 
     222              : /// Returns `true` if [bytes] can be successfully decoded as UTF-8.
     223              : ///
     224              : /// Uses [utf8.decode] with `allowMalformed: false`. Any [FormatException]
     225              : /// (invalid byte sequence) causes the method to return `false`. Note that
     226              : /// this is a structural validity check — it does not distinguish UTF-8 from
     227              : /// pure ASCII (ASCII is a strict subset of UTF-8 and will return `true`).
     228              : ///
     229              : /// An empty byte sequence returns `true` because it is vacuously valid UTF-8.
     230            1 : bool _isValidUtf8(Uint8List bytes) {
     231              :   try {
     232              :     // ignore: unnecessary_ignore
     233            1 :     utf8.decode(bytes, allowMalformed: false);
     234              :     return true;
     235            1 :   } on FormatException {
     236              :     return false;
     237              :   }
     238              : }
     239              : 
     240              : /// Returns `true` when more than 15% of the bytes in [sample] are ≥ `0x80`.
     241              : ///
     242              : /// This heuristic is used to promote CJK candidate encodings (which use
     243              : /// multi-byte sequences in the high byte range) to the front of the probe
     244              : /// list. A threshold of >15% is chosen to avoid false promotion on Western
     245              : /// 8-bit text that may have a small number of accented characters.
     246            1 : bool _looksMultibyte(Uint8List sample) {
     247            1 :   if (sample.isEmpty) return false;
     248              :   var highByteCount = 0;
     249            2 :   for (final b in sample) {
     250            2 :     if (b >= 0x80) highByteCount++;
     251              :   }
     252              :   // Promote CJK when more than 15% of bytes are in the high range.
     253            3 :   return highByteCount / sample.length > 0.15;
     254              : }
     255              : 
     256              : /// Probes [bytes] against each candidate encoding using
     257              : /// [Charset.canDecode] and returns the IANA label of the first matching
     258              : /// encoding.
     259              : ///
     260              : /// CJK candidates are moved to the front of the probe order when
     261              : /// [_looksMultibyte] returns `true` for [bytes]. This reduces false Western
     262              : /// matches on CJK content with dense high-byte sequences.
     263              : ///
     264              : /// The probe is a structural validity check: [Charset.canDecode] decodes the
     265              : /// sample and rejects it if the decoded string contains the Unicode
     266              : /// replacement character U+FFFD. This makes the probe reliable for CJK
     267              : /// multi-byte encodings (which have strict structural constraints) but less
     268              : /// discriminating among Western 8-bit encodings (which can accept almost any
     269              : /// byte sequence).
     270              : ///
     271              : /// Returns `'windows-1252'` if no candidate matches. This follows the WHATWG
     272              : /// Encoding specification fallback, as Windows-1252 is the most common legacy
     273              : /// Western encoding and a superset of ISO-8859-1.
     274            1 : String _probeEncoding(Uint8List bytes) {
     275              :   // Build the probe order based on whether the sample looks like multi-byte
     276              :   // (CJK) content.
     277            1 :   final candidates = _looksMultibyte(bytes)
     278            3 :       ? [..._cjkCandidates, ..._westernCandidates]
     279            3 :       : [..._westernCandidates, ..._cjkCandidates];
     280              : 
     281            2 :   for (final encoding in candidates) {
     282            1 :     if (Charset.canDecode(encoding, bytes)) {
     283              :       // Return the IANA label from the explicit map rather than encoding.name,
     284              :       // because some codecs use non-hyphenated or non-IANA internal names.
     285            2 :       return _ianaLabels[encoding] ?? encoding.name;
     286              :     }
     287              :   }
     288              : 
     289              :   // Fallback: windows-1252 per the WHATWG Encoding specification.
     290              :   return 'windows-1252';
     291              : }

Generated by: LCOV version 2.0-1