|
|
var TokenStream = require('../common/TokenStream'); var adoptBuffer = require('../common/adopt-buffer');
var constants = require('./const'); var TYPE = constants.TYPE;
var charCodeDefinitions = require('./char-code-definitions'); var isNewline = charCodeDefinitions.isNewline; var isName = charCodeDefinitions.isName; var isValidEscape = charCodeDefinitions.isValidEscape; var isNumberStart = charCodeDefinitions.isNumberStart; var isIdentifierStart = charCodeDefinitions.isIdentifierStart; var charCodeCategory = charCodeDefinitions.charCodeCategory; var isBOM = charCodeDefinitions.isBOM;
var utils = require('./utils'); var cmpStr = utils.cmpStr; var getNewlineLength = utils.getNewlineLength; var findWhiteSpaceEnd = utils.findWhiteSpaceEnd; var consumeEscaped = utils.consumeEscaped; var consumeName = utils.consumeName; var consumeNumber = utils.consumeNumber; var consumeBadUrlRemnants = utils.consumeBadUrlRemnants;
// Packed buffer entries keep a token type in the bits above TYPE_SHIFT and
// an offset (or a token index) in the low bits covered by OFFSET_MASK.
var TYPE_SHIFT = 24;
var OFFSET_MASK = (1 << TYPE_SHIFT) - 1; // 0x00FFFFFF
/**
 * Split CSS text into tokens, following the tokenizer algorithms of
 * CSS Syntax Module Level 3 (https://drafts.csswg.org/css-syntax-3/#consume-token).
 *
 * For every token the value `(type << TYPE_SHIFT) | endOffset` is written to
 * the `offsetAndType` buffer. The `balance` buffer links a token that opens a
 * bracket (or a function) with the token that closes it.
 *
 * @param {*} source Text to tokenize. Falsy values are treated as an empty
 *     string, anything else is converted with `String()`.
 * @param {Object} [stream] Stream to fill. A new `TokenStream` is created
 *     when the argument is falsy.
 * @returns {Object} The filled stream (the one passed in or the created one),
 *     after its `reset()` and `next()` methods have been called.
 */
function tokenize(source, stream) {
    if (!stream) {
        stream = new TokenStream();
    }

    // ensure source is a string
    source = String(source || '');

    const sourceLength = source.length;
    const offsetAndType = adoptBuffer(stream.offsetAndType, sourceLength + 1); // +1 for the <EOF-token>
    const balance = adoptBuffer(stream.balance, sourceLength + 1);
    const start = isBOM(codeAt(0));

    // State shared with the helpers below: `offset` is the scan position,
    // `type` is the type of the token being consumed.
    let offset = start;
    let type = 0;
    let tokenCount = 0;
    let balanceCloseType = 0;
    let balanceStart = 0;

    // Code unit of the source at `position`, or 0 when the position is past the end.
    function codeAt(position) {
        return position < sourceLength ? source.charCodeAt(position) : 0;
    }

    // Finish a token of `tokenType` that takes exactly one code unit.
    function consumeSingle(tokenType) {
        type = tokenType;
        offset++;
    }

    // Skip the remnants of a malformed url and turn the token into a <bad-url-token>.
    function consumeBadUrl() {
        offset = consumeBadUrlRemnants(source, offset);
        type = TYPE.BadUrl;
    }

    // Remember the current token as an opening one that waits for `closeType`.
    function openBalance(closeType) {
        balance[tokenCount] = balanceStart;
        balanceCloseType = closeType;
        balanceStart = (closeType << TYPE_SHIFT) | tokenCount;
    }

    // § 4.3.3. Consume a numeric token
    function consumeNumericToken() {
        offset = consumeNumber(source, offset);

        if (isIdentifierStart(codeAt(offset), codeAt(offset + 1), codeAt(offset + 2))) {
            // a number followed by a name (the unit) is a <dimension-token>
            type = TYPE.Dimension;
            offset = consumeName(source, offset);
        } else if (codeAt(offset) === 0x0025) {
            // a number followed by U+0025 PERCENTAGE SIGN (%) is a <percentage-token>
            type = TYPE.Percentage;
            offset++;
        } else {
            // a plain <number-token>
            type = TYPE.Number;
        }
    }

    // § 4.3.4. Consume an ident-like token
    function consumeIdentLikeToken() {
        const identStart = offset;

        offset = consumeName(source, offset);

        const isUrlName = cmpStr(source, identStart, offset, 'url');

        // a name that is not followed by U+0028 LEFT PARENTHESIS is an <ident-token>
        if (codeAt(offset) !== 0x0028) {
            type = TYPE.Ident;
            return;
        }

        // any name but "url" followed by "(" is a <function-token>
        if (!isUrlName) {
            consumeSingle(TYPE.Function);
            return;
        }

        // "url(" ... look behind the whitespace that follows the parenthesis
        offset = findWhiteSpaceEnd(source, offset + 1);

        const valueStart = codeAt(offset);

        if (valueStart === 0x0022 || valueStart === 0x0027) {
            // a quoted value: emit only "url(" as a <function-token>,
            // whitespace and the string become tokens of their own
            type = TYPE.Function;
            offset = identStart + 4;
            return;
        }

        // an unquoted value
        consumeUrlToken();
    }

    // § 4.3.5. Consume a string token
    function consumeStringToken(endingCodePoint) {
        // When no ending code point is given, the current code point
        // (the opening quote) is used and consumed.
        const quote = endingCodePoint || codeAt(offset++);

        type = TYPE.String;

        for (; offset < source.length; offset++) {
            const code = source.charCodeAt(offset);
            const category = charCodeCategory(code);

            // the closing quote is a part of the token
            if (category === quote) {
                offset++;
                return;
            }

            // EOF: a parse error, the token ends here
            if (category === charCodeCategory.Eof) {
                return;
            }

            if (category === charCodeCategory.WhiteSpace) {
                // An unescaped newline is a parse error: the token becomes
                // a <bad-string-token> and includes the newline.
                if (isNewline(code)) {
                    offset += getNewlineLength(source, offset, code);
                    type = TYPE.BadString;
                    return;
                }

                continue;
            }

            // anything but U+005C REVERSE SOLIDUS (\) is a part of the string value
            if (category !== 0x005C) {
                continue;
            }

            // a backslash right before EOF: do nothing
            if (offset === source.length - 1) {
                continue;
            }

            const nextCode = codeAt(offset + 1);

            if (isNewline(nextCode)) {
                // an escaped newline is skipped
                offset += getNewlineLength(source, offset + 1, nextCode);
            } else if (isValidEscape(code, nextCode)) {
                // -1 compensates the increment made by the loop
                offset = consumeEscaped(source, offset) - 1;
            }
        }
    }

    // § 4.3.6. Consume a url token
    // Expects that the initial "url(" is consumed already and that the value
    // is unquoted, like url(foo). A quoted value, like url("foo"), is handled
    // by consumeIdentLikeToken() and never gets here.
    function consumeUrlToken() {
        type = TYPE.Url;

        // skip leading whitespace
        offset = findWhiteSpaceEnd(source, offset);

        for (; offset < source.length; offset++) {
            const code = source.charCodeAt(offset);

            switch (charCodeCategory(code)) {
                // U+0029 RIGHT PARENTHESIS ()) ends the token
                case 0x0029:
                    offset++;
                    return;

                // EOF: a parse error, the token ends here
                case charCodeCategory.Eof:
                    return;

                // whitespace is allowed only right before ")" or EOF
                case charCodeCategory.WhiteSpace:
                    offset = findWhiteSpaceEnd(source, offset);

                    if (offset >= source.length) {
                        return;
                    }

                    if (codeAt(offset) === 0x0029) {
                        offset++;
                        return;
                    }

                    consumeBadUrl();
                    return;

                // U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('),
                // U+0028 LEFT PARENTHESIS (() and non-printable code points
                // are parse errors
                case 0x0022:
                case 0x0027:
                case 0x0028:
                case charCodeCategory.NonPrintable:
                    consumeBadUrl();
                    return;

                // U+005C REVERSE SOLIDUS (\) must start a valid escape
                case 0x005C:
                    if (!isValidEscape(code, codeAt(offset + 1))) {
                        consumeBadUrl();
                        return;
                    }

                    // -1 compensates the increment made by the loop
                    offset = consumeEscaped(source, offset) - 1;
                    break;

                // anything else is a part of the url value
            }
        }
    }

    // § 4.3.1. Consume a token
    while (offset < sourceLength) {
        const code = source.charCodeAt(offset);

        type = 0;
        balance[tokenCount] = sourceLength;

        switch (charCodeCategory(code)) {
            // whitespace: as much as possible goes to a single <whitespace-token>
            case charCodeCategory.WhiteSpace:
                type = TYPE.WhiteSpace;
                offset = findWhiteSpaceEnd(source, offset + 1);
                break;

            // U+0022 QUOTATION MARK (") and U+0027 APOSTROPHE (') start a string
            case 0x0022:
            case 0x0027:
                consumeStringToken();
                break;

            // U+0023 NUMBER SIGN (#)
            case 0x0023:
                if (isName(codeAt(offset + 1)) || isValidEscape(codeAt(offset + 1), codeAt(offset + 2))) {
                    // TODO: set the "id" type flag when the next 3 code points would start an identifier
                    type = TYPE.Hash;
                    offset = consumeName(source, offset + 1);
                } else {
                    consumeSingle(TYPE.Delim);
                }
                break;

            // U+0028 LEFT PARENTHESIS (()
            case 0x0028:
                consumeSingle(TYPE.LeftParenthesis);
                break;

            // U+0029 RIGHT PARENTHESIS ())
            case 0x0029:
                consumeSingle(TYPE.RightParenthesis);
                break;

            // U+002B PLUS SIGN (+) and U+002E FULL STOP (.) either start
            // a number or are delimiters
            case 0x002B:
            case 0x002E:
                if (isNumberStart(code, codeAt(offset + 1), codeAt(offset + 2))) {
                    consumeNumericToken();
                } else {
                    consumeSingle(TYPE.Delim);
                }
                break;

            // U+002C COMMA (,)
            case 0x002C:
                consumeSingle(TYPE.Comma);
                break;

            // U+002D HYPHEN-MINUS (-) may start a number, a <CDC-token> (-->)
            // or an identifier; otherwise it is a delimiter
            case 0x002D:
                if (isNumberStart(code, codeAt(offset + 1), codeAt(offset + 2))) {
                    consumeNumericToken();
                } else if (codeAt(offset + 1) === 0x002D && codeAt(offset + 2) === 0x003E) {
                    type = TYPE.CDC;
                    offset += 3;
                } else if (isIdentifierStart(code, codeAt(offset + 1), codeAt(offset + 2))) {
                    consumeIdentLikeToken();
                } else {
                    consumeSingle(TYPE.Delim);
                }
                break;

            // U+002F SOLIDUS (/) followed by U+002A ASTERISK (*) starts a comment
            case 0x002F:
                if (codeAt(offset + 1) === 0x002A) {
                    const commentEnd = source.indexOf('*/', offset + 2);

                    // an unclosed comment lasts up to the end of the source
                    type = TYPE.Comment;
                    offset = commentEnd === -1 ? source.length : commentEnd + 2;
                } else {
                    consumeSingle(TYPE.Delim);
                }
                break;

            // U+003A COLON (:)
            case 0x003A:
                consumeSingle(TYPE.Colon);
                break;

            // U+003B SEMICOLON (;)
            case 0x003B:
                consumeSingle(TYPE.Semicolon);
                break;

            // U+003C LESS-THAN SIGN (<) followed by "!--" is a <CDO-token>
            case 0x003C:
                if (codeAt(offset + 1) === 0x0021 &&
                    codeAt(offset + 2) === 0x002D &&
                    codeAt(offset + 3) === 0x002D) {
                    type = TYPE.CDO;
                    offset += 4;
                } else {
                    consumeSingle(TYPE.Delim);
                }
                break;

            // U+0040 COMMERCIAL AT (@) followed by an identifier is an <at-keyword-token>
            case 0x0040:
                if (isIdentifierStart(codeAt(offset + 1), codeAt(offset + 2), codeAt(offset + 3))) {
                    type = TYPE.AtKeyword;
                    offset = consumeName(source, offset + 1);
                } else {
                    consumeSingle(TYPE.Delim);
                }
                break;

            // U+005B LEFT SQUARE BRACKET ([)
            case 0x005B:
                consumeSingle(TYPE.LeftSquareBracket);
                break;

            // U+005C REVERSE SOLIDUS (\) starts an ident-like token when it is
            // a valid escape; otherwise it is a parse error and a delimiter
            case 0x005C:
                if (isValidEscape(code, codeAt(offset + 1))) {
                    consumeIdentLikeToken();
                } else {
                    consumeSingle(TYPE.Delim);
                }
                break;

            // U+005D RIGHT SQUARE BRACKET (])
            case 0x005D:
                consumeSingle(TYPE.RightSquareBracket);
                break;

            // U+007B LEFT CURLY BRACKET ({)
            case 0x007B:
                consumeSingle(TYPE.LeftCurlyBracket);
                break;

            // U+007D RIGHT CURLY BRACKET (})
            case 0x007D:
                consumeSingle(TYPE.RightCurlyBracket);
                break;

            // digit
            case charCodeCategory.Digit:
                consumeNumericToken();
                break;

            // name-start code point
            case charCodeCategory.NameStart:
                consumeIdentLikeToken();
                break;

            // EOF
            case charCodeCategory.Eof:
                break;

            // anything else is a <delim-token>
            default:
                consumeSingle(TYPE.Delim);
        }

        // Maintain bracket balance; the closing check goes first on purpose.
        if (type === balanceCloseType) {
            // The token closes the innermost open pair: link both tokens
            // to each other and restore the state of the outer pair.
            let index = balanceStart & OFFSET_MASK;

            balanceStart = balance[index];
            balanceCloseType = balanceStart >> TYPE_SHIFT;
            balance[tokenCount] = index;
            balance[index] = tokenCount;

            // tokens inside the pair that have no link yet point to the closing token
            for (index++; index < tokenCount; index++) {
                if (balance[index] === sourceLength) {
                    balance[index] = tokenCount;
                }
            }
        } else if (type === TYPE.LeftParenthesis || type === TYPE.Function) {
            openBalance(TYPE.RightParenthesis);
        } else if (type === TYPE.LeftSquareBracket) {
            openBalance(TYPE.RightSquareBracket);
        } else if (type === TYPE.LeftCurlyBracket) {
            openBalance(TYPE.RightCurlyBracket);
        }

        offsetAndType[tokenCount++] = (type << TYPE_SHIFT) | offset;
    }

    // finalize buffers
    offsetAndType[tokenCount] = (TYPE.EOF << TYPE_SHIFT) | offset; // <EOF-token>
    balance[tokenCount] = sourceLength;
    balance[sourceLength] = sourceLength; // prevents false positive balance match with any token

    // opening tokens that were never closed get no pair
    while (balanceStart !== 0) {
        const index = balanceStart & OFFSET_MASK;

        balanceStart = balance[index];
        balance[index] = sourceLength;
    }

    // update stream
    stream.source = source;
    stream.firstCharOffset = start;
    stream.offsetAndType = offsetAndType;
    stream.tokenCount = tokenCount;
    stream.balance = balance;
    stream.reset();
    stream.next();

    return stream;
}
// Expose the tokenizer constants and the helper functions from
// char-code-definitions and utils as static members of `tokenize`.
// The dictionaries are copied in this order, so on a name clash
// the later one overrides the earlier one.
[constants, charCodeDefinitions, utils].forEach(function(dictionary) {
    Object.keys(dictionary).forEach(function(key) {
        tokenize[key] = dictionary[key];
    });
});

module.exports = tokenize;
|