Line data Source code
1 : // Copyright 2026 The Authors.
2 : //
3 : // Licensed under the Apache License, Version 2.0 (the "License");
4 : // you may not use this file except in compliance with the License.
5 : // You may obtain a copy of the License at
6 : //
7 : // https://www.apache.org/licenses/LICENSE-2.0
8 : //
9 : // Unless required by applicable law or agreed to in writing, software
10 : // distributed under the License is distributed on an "AS IS" BASIS,
11 : // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 : // See the License for the specific language governing permissions and
13 : // limitations under the License.
14 :
15 : import 'dart:convert';
16 : import 'dart:typed_data';
17 :
18 : import 'package:charset/charset.dart';
19 :
/// Upper bound, in bytes, on how much of the input is inspected.
///
/// Detection never looks past the first [_sampleSize] bytes, which bounds the
/// work done per call regardless of how large the input is.
///
/// The cut is a plain byte offset and is not aligned to a character boundary.
/// A multi-byte sequence can therefore be split at the end of the sample, and
/// strict UTF-8 validation of the *raw* sample fails in that case even when
/// the input as a whole is well-formed UTF-8. How likely that is depends on
/// the content: the more multi-byte characters the text contains, the more
/// likely the cut lands inside one.
const int _sampleSize = 8192; // 8 KiB (8 * 1024)
31 :
/// Label reported to callers for each probe candidate, keyed by the codec
/// object itself.
///
/// Results are taken from this table rather than from [Encoding.name] so that
/// the strings handed back by the detector are owned by this library and do
/// not change if a codec's internal name changes upstream. (The codecs'
/// internal names are reportedly not all in hyphenated label form, e.g.
/// `windows1252` — confirm against the `charset` package.)
///
/// All labels are lowercase.
///
/// NOTE(review): `'shift-jis'` is a label recognised by the WHATWG Encoding
/// Standard, but the IANA registry name is `Shift_JIS` (underscore). It is
/// kept as-is because it is part of the strings callers already receive.
final _ianaLabels = <Encoding, String>{
  // Western single-byte encodings.
  windows1252: 'windows-1252',
  latin1: 'iso-8859-1',
  latin2: 'iso-8859-2',
  latin9: 'iso-8859-15',
  // CJK multi-byte encodings.
  shiftJis: 'shift-jis',
  eucJp: 'euc-jp',
  eucKr: 'euc-kr',
  gbk: 'gbk',
};
50 :
/// Western single-byte encodings tried by the Stage 3 probe, in probe order.
///
/// The probe walks its candidate list front to back and stops at the first
/// codec that accepts the sample, so the order here is significant:
///
/// - `windows-1252` comes first as the most common legacy Western encoding.
/// - `iso-8859-1` follows.
/// - `iso-8859-2` and `iso-8859-15` come last.
///
/// Whether this list is probed before or after [_cjkCandidates] is decided
/// per input by [_looksMultibyte].
///
/// NOTE(review): `latin1` presumably accepts every byte value (the
/// `dart:convert` Latin-1 codec maps bytes 0x00–0xFF one-to-one). If so, no
/// candidate positioned after it in the probe order can ever be selected —
/// confirm whether that is intended.
final _westernCandidates = <Encoding>[
  windows1252,
  latin1, // iso-8859-1
  latin2, // iso-8859-2
  latin9, // iso-8859-15
];

/// CJK multi-byte encodings tried by the Stage 3 probe, in probe order.
///
/// Moved ahead of [_westernCandidates] when [_looksMultibyte] reports a high
/// share of bytes ≥ `0x80` in the sample.
final _cjkCandidates = <Encoding>[shiftJis, eucJp, eucKr, gbk];
75 :
/// Detects the character encoding of the given byte sequence.
///
/// Returns a lowercase encoding label string. The following labels may be
/// returned:
///
/// - BOM-detected: `'utf-8'`, `'utf-16be'`, `'utf-16le'`, `'utf-32be'`,
///   `'utf-32le'`
/// - UTF-8 structural: `'utf-8'`
/// - Legacy 8-bit: `'windows-1252'`, `'iso-8859-1'`, `'iso-8859-2'`,
///   `'iso-8859-15'`
/// - CJK multi-byte: `'shift-jis'`, `'euc-jp'`, `'euc-kr'`, `'gbk'`
/// - Fallback: `'windows-1252'`
///
/// Only the leading [_sampleSize] bytes of [bytes] are examined. Detection
/// proceeds through three ordered stages, falling through only when the
/// current stage cannot make a determination:
///
/// **Stage 1 — BOM inspection (deterministic)**
/// A byte-order mark (BOM), when present, is authoritative. The four-byte
/// UTF-32 BOMs are checked before the two-byte UTF-16 BOMs to prevent
/// UTF-32 LE from being misidentified as UTF-16 LE (they share the same
/// first two bytes: `FF FE`).
///
/// **Stage 2 — UTF-8 structural validation**
/// The sample is decoded as strict UTF-8. A successful decode means the
/// content is UTF-8 (or pure ASCII, which is a strict UTF-8 subset). Empty
/// input passes this stage and returns `'utf-8'`.
///
/// When the input is longer than the sample, the sample ends at an arbitrary
/// byte offset and may cut a multi-byte character in half. Such an incomplete
/// trailing sequence is dropped before validation so that well-formed UTF-8
/// is not rejected merely because of where the sample ends. Malformed bytes
/// anywhere else still fail this stage. As a consequence, legacy-encoded
/// input whose only non-ASCII bytes in the sample form a UTF-8-lead-like tail
/// is reported as `'utf-8'` — the same answer it would get if those bytes lay
/// just beyond the sample.
///
/// **Stage 3 — Candidate probe via the `charset` package**
/// The (untrimmed) sample is tested against each candidate [Encoding] using
/// the static [Charset.canDecode] method. CJK encodings are promoted when
/// more than 15% of sample bytes are ≥ `0x80`. The first candidate to
/// successfully decode the sample wins. If no candidate matches,
/// `'windows-1252'` is returned as the fallback (following the WHATWG
/// Encoding specification default).
///
/// Note: [Charset.canDecode] is understood to treat a decode as invalid if
/// the resulting string contains the Unicode replacement character U+FFFD
/// (`'\uFFFD'`). Input whose decoded text legitimately contains U+FFFD would
/// therefore be rejected by such a codec. This is a known limitation of the
/// structural validity approach.
///
/// Example (web-safe — works on all platforms):
/// ```dart
/// import 'dart:typed_data';
/// import 'package:betto_charset_detector/betto_charset_detector.dart';
///
/// void main() {
///   // Bytes may come from an HTTP response, file picker, dart:io, etc.
///   final bytes = Uint8List.fromList([0xEF, 0xBB, 0xBF, 104, 101, 108, 108, 111]);
///   final encoding = detectCharset(bytes);
///   print('Detected encoding: $encoding'); // utf-8
/// }
/// ```
///
/// On native platforms only, you can read bytes from a file:
/// ```dart
/// import 'dart:io';
/// import 'package:betto_charset_detector/betto_charset_detector.dart';
///
/// void main() {
///   final bytes = File('data.csv').readAsBytesSync();
///   final encoding = detectCharset(bytes);
///   print('Detected encoding: $encoding');
/// }
/// ```
String detectCharset(Uint8List bytes) {
  // Cap the input first so that every stage works on the same bounded view.
  // `sublistView` does not copy the underlying bytes.
  final truncated = bytes.length > _sampleSize;
  final sample =
      truncated ? Uint8List.sublistView(bytes, 0, _sampleSize) : bytes;

  // Stage 1: BOM inspection.
  final bomResult = _detectBom(sample);
  if (bomResult != null) {
    return bomResult;
  }

  // Stage 2: UTF-8 structural validation.
  // Only a truncated sample can end mid-character through no fault of the
  // input; a complete input must validate exactly as given.
  final utf8Sample = truncated ? _trimIncompleteUtf8Tail(sample) : sample;
  if (_isValidUtf8(utf8Sample)) {
    return 'utf-8';
  }

  // Stage 3: Candidate probe, on the sample exactly as cut.
  return _probeEncoding(sample);
}

/// Returns [sample] without a trailing incomplete UTF-8 sequence.
///
/// Looks at no more than the last three bytes: a UTF-8 sequence is at most
/// four bytes long, so an incomplete one occupies at most three. If the tail
/// consists of a lead byte followed by fewer continuation bytes than the lead
/// byte announces, a view ending just before that lead byte is returned.
/// Otherwise [sample] is returned unchanged — including when the tail is
/// malformed in some other way, which is left for the validator to reject.
Uint8List _trimIncompleteUtf8Tail(Uint8List sample) {
  final length = sample.length;
  for (var back = 1; back <= 3 && back <= length; back++) {
    final byte = sample[length - back];

    // 10xxxxxx: continuation byte — keep walking back towards the lead byte.
    if ((byte & 0xC0) == 0x80) continue;

    // First non-continuation byte from the end: how long a sequence does it
    // announce? ASCII and invalid lead bytes count as length 1 (never trimmed).
    var expected = 1;
    if ((byte & 0xE0) == 0xC0) {
      expected = 2; // 110xxxxx
    } else if ((byte & 0xF0) == 0xE0) {
      expected = 3; // 1110xxxx
    } else if ((byte & 0xF8) == 0xF0) {
      expected = 4; // 11110xxx
    }

    // `back` bytes of the sequence are present; trim only if more were due.
    return expected > back
        ? Uint8List.sublistView(sample, 0, length - back)
        : sample;
  }
  // The last three bytes are all continuation bytes: either the end of a
  // complete four-byte sequence or malformed. Nothing to trim either way.
  return sample;
}
163 :
/// Identifies a byte-order mark at the start of [bytes].
///
/// Returns the label of the encoding the BOM stands for, or `null` when
/// [bytes] does not begin with any of the recognised marks (including when it
/// is too short to hold one).
///
/// Signatures are tested longest first. That order matters: the UTF-32 LE
/// mark (`FF FE 00 00`) begins with the UTF-16 LE mark (`FF FE`), so testing
/// the shorter one first would misreport UTF-32 LE input.
///
/// | BOM bytes       | Returned label |
/// | :-------------- | :------------- |
/// | `00 00 FE FF`   | `'utf-32be'`   |
/// | `FF FE 00 00`   | `'utf-32le'`   |
/// | `EF BB BF`      | `'utf-8'`      |
/// | `FE FF`         | `'utf-16be'`   |
/// | `FF FE`         | `'utf-16le'`   |
String? _detectBom(Uint8List bytes) {
  // Whether [bytes] begins with exactly the given signature bytes.
  bool startsWith(List<int> signature) {
    if (bytes.length < signature.length) return false;
    for (var i = 0; i < signature.length; i++) {
      if (bytes[i] != signature[i]) return false;
    }
    return true;
  }

  // Four-byte marks.
  if (startsWith(const [0x00, 0x00, 0xFE, 0xFF])) return 'utf-32be';
  if (startsWith(const [0xFF, 0xFE, 0x00, 0x00])) return 'utf-32le';

  // Three-byte mark.
  if (startsWith(const [0xEF, 0xBB, 0xBF])) return 'utf-8';

  // Two-byte marks.
  if (startsWith(const [0xFE, 0xFF])) return 'utf-16be';
  if (startsWith(const [0xFF, 0xFE])) return 'utf-16le';

  return null;
}
221 :
/// Whether [bytes] is a well-formed UTF-8 byte sequence.
///
/// The bytes are run through a strict UTF-8 decoder; a [FormatException]
/// from the decoder is reported as `false`. The decoded text is discarded.
///
/// This is purely a structural check. Pure ASCII input is valid UTF-8 and
/// yields `true`, as does an empty sequence.
bool _isValidUtf8(Uint8List bytes) {
  // A default-constructed decoder is strict (allowMalformed defaults to
  // false), so malformed input throws instead of yielding U+FFFD.
  const strictDecoder = Utf8Decoder();
  try {
    strictDecoder.convert(bytes);
  } on FormatException {
    return false;
  }
  return true;
}
239 :
/// Whether high bytes (≥ `0x80`) make up more than 15% of [sample].
///
/// Used by the probe to decide if the CJK candidates should be tried before
/// the Western ones. The threshold is strict (`>`), so a sample with exactly
/// 15% high bytes does not qualify; the margin keeps Western 8-bit text with
/// a modest number of accented characters from triggering the promotion.
///
/// An empty [sample] yields `false`.
bool _looksMultibyte(Uint8List sample) {
  if (sample.isEmpty) return false;

  final highBytes = sample.where((byte) => byte >= 0x80).length;
  final highRatio = highBytes / sample.length;
  return highRatio > 0.15;
}
255 :
/// Finds the first candidate encoding that accepts [bytes] and returns its
/// label.
///
/// The candidates are the entries of [_westernCandidates] and
/// [_cjkCandidates], each group kept in its declared order. The CJK group is
/// probed first when [_looksMultibyte] is `true` for [bytes]; otherwise the
/// Western group is probed first.
///
/// Acceptance is decided by [Charset.canDecode], which is understood to
/// decode the bytes and reject the result if decoding fails or yields the
/// replacement character U+FFFD — confirm against the `charset` package.
/// This is a structural test: multi-byte CJK encodings have structural
/// constraints that let it discriminate, whereas Western 8-bit encodings
/// accept almost any byte sequence and are hard to tell apart this way.
///
/// The returned label comes from [_ianaLabels]; [Encoding.name] is used only
/// if a candidate has no entry there. Returns `'windows-1252'` when no
/// candidate accepts [bytes], following the WHATWG Encoding specification
/// default.
String _probeEncoding(Uint8List bytes) {
  // Dense high-byte content suggests a multi-byte encoding, so try those
  // codecs before the permissive Western ones.
  final cjkFirst = _looksMultibyte(bytes);
  final probeOrder = <Encoding>[
    if (cjkFirst) ..._cjkCandidates,
    ..._westernCandidates,
    if (!cjkFirst) ..._cjkCandidates,
  ];

  for (final candidate in probeOrder) {
    if (!Charset.canDecode(candidate, bytes)) continue;
    // Prefer the library-owned label; the codec's own name is a last resort.
    return _ianaLabels[candidate] ?? candidate.name;
  }

  // Nothing matched: fall back to the WHATWG default.
  return 'windows-1252';
}
|