You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
"use strict"; const whatwgEncoding = require("whatwg-encoding");
// https://html.spec.whatwg.org/#encoding-sniffing-algorithm
module.exports = (buffer, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => { let encoding = whatwgEncoding.getBOMEncoding(buffer); // see https://github.com/whatwg/html/issues/1910
if (encoding === null && transportLayerEncodingLabel !== undefined) { encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel); }
if (encoding === null) { encoding = prescanMetaCharset(buffer); }
if (encoding === null) { encoding = defaultEncoding; }
return encoding; };
// https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
/**
 * Scans at most the first 1024 bytes of `buffer` for a `<meta>` tag that
 * declares an encoding.
 *
 * Comments, other tags and `<!…>` / `</…>` / `<?…>` constructs are skipped so
 * that their contents are not mistaken for a `<meta>` tag.
 *
 * @param {object} buffer - byte sequence with `length` and numeric indexing
 * @returns {string|null} the declared encoding, or `null` when none is found
 *   inside the scanned window
 */
function prescanMetaCharset(buffer) {
  const limit = Math.min(buffer.length, 1024);

  for (let pos = 0; pos < limit; pos++) {
    // Only "<" can start something worth looking at.
    if (buffer[pos] !== 0x3C) {
      continue;
    }

    const b1 = buffer[pos + 1];
    const b2 = buffer[pos + 2];
    const b3 = buffer[pos + 3];
    const b4 = buffer[pos + 4];
    const b5 = buffer[pos + 5];

    // "<!--": move to the ">" that ends "-->". The scan starts right after
    // the opening dashes, so those dashes may also serve as the closing pair.
    if (b1 === 0x21 && b2 === 0x2D && b3 === 0x2D) {
      pos += 4;
      while (
        pos < limit &&
        !(buffer[pos] === 0x3E && buffer[pos - 1] === 0x2D && buffer[pos - 2] === 0x2D)
      ) {
        pos++;
      }
      continue;
    }

    // "<meta" (any letter case) followed by whitespace or "/".
    const startsMetaTag =
      (b1 === 0x4D || b1 === 0x6D) &&
      (b2 === 0x45 || b2 === 0x65) &&
      (b3 === 0x54 || b3 === 0x74) &&
      (b4 === 0x41 || b4 === 0x61) &&
      (isSpaceCharacter(b5) || b5 === 0x2F);

    if (startsMetaTag) {
      pos += 6;
      const seenNames = new Set();
      let gotPragma = false;
      let needPragma = null;
      let charset = null;

      for (;;) {
        const { attr, i: resumeAt } = getAttribute(buffer, pos, limit);
        pos = resumeAt;
        if (!attr) {
          break;
        }
        // Only the first occurrence of an attribute name counts.
        if (seenNames.has(attr.name)) {
          continue;
        }
        seenNames.add(attr.name);

        switch (attr.name) {
          case "http-equiv":
            gotPragma = attr.value === "content-type";
            break;
          case "content":
            if (!charset) {
              charset = extractCharacterEncodingFromMeta(attr.value);
              if (charset !== null) {
                needPragma = true;
              }
            }
            break;
          case "charset":
            charset = whatwgEncoding.labelToName(attr.value);
            needPragma = false;
            break;
          default:
            break;
        }
      }

      // Reject this tag when no encoding attribute was seen, when the
      // encoding came from `content` without `http-equiv="content-type"`,
      // or when the label did not resolve.
      if (needPragma === null || (needPragma && !gotPragma) || charset === null) {
        continue;
      }

      if (charset === "UTF-16LE" || charset === "UTF-16BE") {
        return "UTF-8";
      }
      if (charset === "x-user-defined") {
        return "windows-1252";
      }
      return charset;
    }

    // "<" + ASCII letter: some other tag. Skip its name, then consume its
    // attributes so that their values are not scanned as markup.
    if ((b1 >= 0x41 && b1 <= 0x5A) || (b1 >= 0x61 && b1 <= 0x7A)) {
      pos += 2;
      while (pos < limit && !isSpaceCharacter(buffer[pos]) && buffer[pos] !== 0x3E) {
        pos++;
      }
      let parsed;
      do {
        parsed = getAttribute(buffer, pos, limit);
        pos = parsed.i;
      } while (parsed.attr);
      continue;
    }

    // "<!", "</" or "<?": skip ahead to the next ">".
    if (b1 === 0x21 || b1 === 0x2F || b1 === 0x3F) {
      pos += 2;
      while (pos < limit && buffer[pos] !== 0x3E) {
        pos++;
      }
    }
  }

  return null;
}
// https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
/**
 * Reads one attribute of the tag currently being scanned.
 *
 * Leading whitespace and "/" bytes are skipped. Bytes A-Z in both the name
 * and the value are lowercased; every other byte is appended unchanged via
 * `String.fromCharCode`.
 *
 * @param {object} buffer - byte sequence being scanned (numeric indexing)
 * @param {number} i - index at which to start
 * @param {number} l - exclusive upper bound of the scan window
 * @returns {{attr?: {name: string, value: string}, i: number}}
 *   `attr` is present when an attribute was read; `i` is where the caller
 *   should resume: just past the closing quote for a quoted value, otherwise
 *   on the byte that ended the attribute. Without `attr`, `i` is either on
 *   the tag's ">" or at/after `l`.
 */
function getAttribute(buffer, i, l) {
  // Lowercases A-Z and leaves every other byte as is.
  const toLowerChar = (byte) => String.fromCharCode(byte >= 0x41 && byte <= 0x5A ? byte + 0x20 : byte);

  for (; i < l; i++) {
    let c = buffer[i];

    // Whitespace or "/" before the attribute: skip.
    if (isSpaceCharacter(c) || c === 0x2F) {
      continue;
    }

    // ">" ends the tag: there is no attribute to report.
    if (c === 0x3E) {
      break;
    }

    let name = "";
    let value = "";

    nameLoop: for (; i < l; i++) {
      c = buffer[i];

      // "=" after a non-empty name: the value starts on the next byte.
      if (c === 0x3D && name !== "") {
        i++;
        break;
      }

      // Whitespace ends the name; the next non-space byte decides whether a
      // value follows ("=") or the attribute has none.
      if (isSpaceCharacter(c)) {
        for (i++; i < l; i++) {
          c = buffer[i];
          if (isSpaceCharacter(c)) {
            continue;
          }
          if (c !== 0x3D) {
            return { attr: { name, value }, i };
          }
          i++;
          break nameLoop;
        }
        break;
      }

      // "/" or ">" right after the name: attribute without a value.
      if (c === 0x2F || c === 0x3E) {
        return { attr: { name, value }, i };
      }

      name += toLowerChar(c);
    }

    // Skip whitespace between "=" and the value.
    c = buffer[i];
    if (isSpaceCharacter(c)) {
      for (i++; i < l; i++) {
        c = buffer[i];
        if (!isSpaceCharacter(c)) {
          break;
        }
      }
    }

    // Quoted value: everything up to the matching quote.
    if (c === 0x22 || c === 0x27) {
      const quote = c;
      for (i++; i < l; i++) {
        c = buffer[i];
        if (c === quote) {
          i++;
          return { attr: { name, value }, i };
        }
        value += toLowerChar(c);
      }
      // The closing quote is not inside the scan window. Report "no
      // attribute" instead of falling through to the unquoted-value code,
      // which would append the last byte again and, when that byte is ">",
      // return an attribute built from the unterminated value.
      return { i };
    }

    // ">" where the value should start: attribute with an empty value.
    if (c === 0x3E) {
      return { attr: { name, value }, i };
    }

    // Unquoted value: runs until whitespace or ">".
    value += toLowerChar(c);
    for (i++; i < l; i++) {
      c = buffer[i];
      if (isSpaceCharacter(c) || c === 0x3E) {
        return { attr: { name, value }, i };
      }
      value += toLowerChar(c);
    }
  }

  return { i };
}
/**
 * Extracts the encoding named by a `charset=` declaration inside a string
 * such as the value of a `<meta content="…">` attribute.
 *
 * The first "charset" (any letter case) that is followed, after optional
 * whitespace, by "=" is used. The label after "=" is either quoted (single
 * or double quotes) or runs up to the next whitespace or ";". It is resolved
 * with `whatwgEncoding.labelToName`.
 *
 * @param {string} string - text to search
 * @returns {string|null} whatever `whatwgEncoding.labelToName` returns for
 *   the label, or `null` when there is no usable `charset=` declaration
 *   (none present, nothing after "=", or an unmatched quote)
 */
function extractCharacterEncodingFromMeta(string) {
  let position = 0;

  while (true) {
    const indexOfCharset = string.substring(position).search(/charset/i);
    if (indexOfCharset === -1) {
      return null;
    }

    let subPosition = position + indexOfCharset + "charset".length;

    // The bounds check matters: "charset" may be the last thing in the
    // string, and indexing past the end must not throw.
    while (subPosition < string.length && isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }

    if (string[subPosition] !== "=") {
      // Not a declaration; look for another "charset" further on.
      position = subPosition - 1;
      continue;
    }

    ++subPosition;

    while (subPosition < string.length && isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }

    position = subPosition;
    break;
  }

  if (string[position] === "\"" || string[position] === "'") {
    const nextIndex = string.indexOf(string[position], position + 1);
    if (nextIndex !== -1) {
      return whatwgEncoding.labelToName(string.substring(position + 1, nextIndex));
    }
    // It is an unmatched quotation mark
    return null;
  }

  // Nothing follows "=" (the string ended), or only a single character is
  // left; the latter already returned null before this guard was widened.
  if (string.length <= position + 1) {
    return null;
  }

  const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/);
  const end = indexOfASCIIWhitespaceOrSemicolon === -1 ?
    string.length :
    position + indexOfASCIIWhitespaceOrSemicolon + 1;

  return whatwgEncoding.labelToName(string.substring(position, end));
}
/**
 * Tells whether a byte / char code is one of the five whitespace values the
 * scanner recognizes: tab (0x09), line feed (0x0A), form feed (0x0C),
 * carriage return (0x0D) or space (0x20).
 *
 * @param {number} c - value to test (compared strictly, no coercion)
 * @returns {boolean} `true` for one of the five values above
 */
function isSpaceCharacter(c) {
  switch (c) {
    case 0x09:
    case 0x0A:
    case 0x0C:
    case 0x0D:
    case 0x20:
      return true;
    default:
      return false;
  }
}
|