lambdaworld-archive/node_modules/shift-regexp-acceptor/src/index.js

755 lines
23 KiB
JavaScript

/**
* Copyright 2018 Shape Security, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License")
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* eslint-disable no-use-before-define */
const matchPropertyValue = require('unicode-match-property-value-ecmascript');
const matchPropertyValueMappings = require('unicode-match-property-value-ecmascript/data/mappings');
const matchProperty = require('unicode-match-property-ecmascript');
const propertyAliases = require('unicode-property-aliases-ecmascript');
const { idContinueBool, idContinueLargeRegex, idStartBool, idStartLargeRegex } = require('./unicode');
/**
 * Evaluates `predicate` and reports whether it produced a truthy value.
 * Any exception thrown by the callback is treated as a `false` result.
 *
 * @param {Function} predicate - zero-argument callback to evaluate.
 * @returns {boolean} `true` only when the callback returned a truthy value without throwing.
 */
const catchIsFalse = predicate => {
  let outcome = false;
  try {
    outcome = Boolean(predicate());
  } catch (ignored) {
    // A throwing predicate counts as "not satisfied".
    outcome = false;
  }
  return outcome;
};
// Single-character lookup tables used by the acceptors below.

// Characters that are never accepted as a plain pattern character.
const syntaxCharacters = [...'^$\\.*+?()[]{}|'];
// Reduced set used for the extended (non-unicode) pattern character rule.
const extendedSyntaxCharacters = [...'^$\\.*+?()[|'];
// Letters that form a control escape (`\f`, `\n`, `\r`, `\t`, `\v`) and the code units they denote.
const controlEscapeCharacters = [...'fnrtv'];
const controlEscapeCharacterValues = {
  f: 0x0C,
  n: 0x0A,
  r: 0x0D,
  t: 0x09,
  v: 0x0B,
};
// ASCII letters (lower case first, then upper case).
const controlCharacters = [...'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'];
const hexDigits = [...'0123456789abcdefABCDEF'];
const decimalDigits = [...'0123456789'];
const octalDigits = [...'01234567'];
// Unique object pushed in place of a name when `\k` is not followed by a valid group name.
const INVALID_NAMED_BACKREFERENCE_SENTINEL = {};
/**
 * Decides whether a code point is classified as an identifier start.
 * Values below 128 are looked up in the `idStartBool` table; everything else
 * is tested against `idStartLargeRegex`.
 *
 * @param {number} ch - code point to classify.
 * @returns {*} the table entry for values below 128, otherwise the boolean regex test result.
 */
function isIdentifierStart(ch) {
  if (ch < 128) {
    return idStartBool[ch];
  }
  const asString = String.fromCodePoint(ch);
  return idStartLargeRegex.test(asString);
}
/**
 * Decides whether a code point is classified as an identifier continuation.
 * Values below 128 are looked up in the `idContinueBool` table; everything
 * else is tested against `idContinueLargeRegex`.
 *
 * @param {number} ch - code point to classify.
 * @returns {*} the table entry for values below 128, otherwise the boolean regex test result.
 */
function isIdentifierPart(ch) {
  if (ch < 128) {
    return idContinueBool[ch];
  }
  const asString = String.fromCodePoint(ch);
  return idContinueLargeRegex.test(asString);
}
/**
 * Mutable cursor over a regular expression pattern, plus the bookkeeping
 * (capturing group count, backreferences, group names) that is validated once
 * the whole pattern has been consumed.
 */
class PatternAcceptorState {
  /**
   * @param {string} pattern - pattern source text.
   * @param {boolean} unicode - whether the pattern is read in unicode mode.
   */
  constructor(pattern, unicode) {
    this.pattern = pattern;
    this.unicode = unicode;
    this.index = 0;
    this.largestBackreference = 0;
    this.backreferenceNames = [];
    this.groupingNames = [];
    this.capturingGroups = 0;
  }

  /** @returns {boolean} `true` once the cursor has reached the end of the pattern. */
  empty() {
    return this.index >= this.pattern.length;
  }

  /**
   * Records a numeric backreference, keeping only the largest one seen.
   * @param {number} ref - backreference number.
   */
  backreference(ref) {
    if (ref > this.largestBackreference) {
      this.largestBackreference = ref;
    }
  }

  /**
   * Peeks at the next character without consuming it.
   * @returns {?string} a whole code point in unicode mode, a single code unit
   *   otherwise, or `null` at end of input.
   */
  nextCodePoint() {
    if (this.empty()) {
      return null;
    }
    if (this.unicode) {
      return String.fromCodePoint(this.pattern.codePointAt(this.index));
    }
    return this.pattern.charAt(this.index);
  }

  /** Advances past the next character; does nothing at end of input. */
  skipCodePoint() {
    const next = this.nextCodePoint();
    // Guard: `nextCodePoint()` is null at end of input and has no `.length`.
    if (next !== null) {
      this.index += next.length;
    }
  }

  /**
   * Tests whether the pattern continues with `str` at the cursor.
   * @param {string} str
   * @returns {boolean}
   */
  match(str) {
    const end = this.index + str.length;
    return end <= this.pattern.length && this.pattern.slice(this.index, end) === str;
  }

  /**
   * Tests whether the pattern continues with any of the given strings.
   * @param {...string} strs
   * @returns {boolean}
   */
  matchAny(...strs) {
    return strs.some(str => this.match(str));
  }

  /**
   * Consumes `str` if the pattern continues with it.
   * @param {string} str
   * @returns {boolean} whether `str` was consumed.
   */
  eat(str) {
    if (!this.match(str)) {
      return false;
    }
    this.index += str.length;
    return true;
  }

  /**
   * Consumes the first of the given strings that matches at the cursor.
   * @param {...string} strs
   * @returns {?string} the consumed string, or `null` if none matched.
   */
  eatAny(...strs) {
    for (const str of strs) {
      if (this.eat(str)) {
        return str;
      }
    }
    return null;
  }

  /**
   * Consumes one identifier character, either a literal character or a
   * `\u` escape decoded through `acceptUnicodeEscape`.
   * @returns {?{character: string, characterValue: number}} the decoded
   *   character, or `null` (with the cursor restored) when nothing usable was found.
   */
  eatIdentifierCodePoint() {
    const originalIndex = this.index;
    if (this.match('\\u')) {
      this.skipCodePoint(); // consume the backslash; the escape acceptor expects to start at `u`
      const escape = acceptUnicodeEscape(this);
      const value = escape.value;
      // `String.fromCodePoint` throws a RangeError for anything that is not an
      // integer in [0, 0x10FFFF]; treat such an escape as "no character".
      const usable = escape.matched && Number.isInteger(value) && value >= 0 && value <= 0x10FFFF;
      if (!usable) {
        this.index = originalIndex;
        return null;
      }
      return { character: String.fromCodePoint(value), characterValue: value };
    }
    const character = this.nextCodePoint();
    if (character === null) {
      this.index = originalIndex;
      return null;
    }
    this.index += character.length;
    return { character, characterValue: character.codePointAt(0) };
  }

  /**
   * Consumes a character accepted as the first character of a group name.
   * @returns {?string} the character, or `null` with the cursor restored.
   */
  eatIdentifierStart() {
    const originalIndex = this.index;
    const codePoint = this.eatIdentifierCodePoint();
    if (codePoint !== null) {
      const { character, characterValue } = codePoint;
      if (character === '_' || character === '$' || isIdentifierStart(characterValue)) {
        return character;
      }
    }
    this.index = originalIndex;
    return null;
  }

  /**
   * Consumes a character accepted after the first character of a group name.
   * @returns {?string} the character, or `null` with the cursor restored.
   */
  eatIdentifierPart() {
    const originalIndex = this.index;
    const codePoint = this.eatIdentifierCodePoint();
    if (codePoint !== null) {
      const { character, characterValue } = codePoint;
      // ZWNJ / ZWJ
      if (character === '\u200C' || character === '\u200D' || character === '$' || isIdentifierPart(characterValue)) {
        return character;
      }
    }
    this.index = originalIndex;
    return null;
  }

  /**
   * Consumes the longest run of ASCII decimal digits at the cursor.
   * @returns {?string} the digits consumed, or `null` if there were none.
   */
  eatNaturalNumber() {
    const start = this.index;
    while (this.index < this.pattern.length) {
      const ch = this.pattern.charAt(this.index);
      if (ch < '0' || ch > '9') {
        break;
      }
      this.index++;
    }
    return this.index === start ? null : this.pattern.slice(start, this.index);
  }
}
// acceptRegex
module.exports = (pattern, { unicode = false } = {}) => {
let state = new PatternAcceptorState(pattern, unicode);
let accepted = acceptDisjunction(state);
if (accepted.matched) {
if (state.unicode) {
if (state.largestBackreference > state.capturingGroups) {
return false;
}
}
if (state.groupingNames.length > 0 || state.unicode) {
for (let backreferenceName of state.backreferenceNames) {
if (state.groupingNames.indexOf(backreferenceName) === -1) {
return false;
}
}
}
}
return accepted.matched;
};
/**
 * Wraps an acceptor so that a failed attempt leaves no trace on the state:
 * `index`, `largestBackreference` and `capturingGroups` are snapshotted before
 * the call and written back when the result is not `matched`.
 *
 * @param {Function} func - acceptor taking the state and returning `{ matched, ... }`.
 * @returns {Function} acceptor with the same result that restores the state on failure.
 */
const backtrackOnFailure = func => state => {
  const { index, largestBackreference, capturingGroups } = state;
  const result = func(state);
  if (result.matched) {
    return result;
  }
  state.index = index;
  state.largestBackreference = largestBackreference;
  state.capturingGroups = capturingGroups;
  return result;
};
/**
 * Accepts the part of a unicode escape that follows the backslash: `uXXXX`,
 * or in unicode mode `u{X...}`, and also a `uXXXX\uXXXX` lead/trail surrogate
 * pair, which is combined into one code point.
 *
 * @param {PatternAcceptorState} state - cursor positioned at the `u`.
 * @returns {{matched: boolean, value?: number}} on success, `value` is the
 *   decoded code point (or code unit for a lone `uXXXX`).
 */
const acceptUnicodeEscape = backtrackOnFailure(state => {
  if (!state.eat('u')) {
    return { matched: false };
  }
  if (state.unicode && state.eat('{')) {
    const digits = [];
    while (!state.eat('}')) {
      const digit = state.eatAny(...hexDigits);
      if (digit === null) {
        return { matched: false };
      }
      digits.push(digit);
    }
    // `\u{}` has no digits: parseInt('') is NaN and `NaN > 0x10FFFF` is false,
    // so without this check the empty escape would be accepted with a NaN value.
    if (digits.length === 0) {
      return { matched: false };
    }
    const value = parseInt(digits.join(''), 16);
    return value > 0x10FFFF ? { matched: false } : { matched: true, value };
  }
  const digits = [0, 0, 0, 0].map(() => state.eatAny(...hexDigits));
  if (digits.some(digit => digit === null)) {
    return { matched: false };
  }
  const value = parseInt(digits.join(''), 16);
  if (state.unicode && value >= 0xD800 && value <= 0xDBFF) {
    // Lead surrogate: try to pair it with an immediately following `\uXXXX` trail surrogate.
    const surrogatePairValue = backtrackOnFailure(subState => {
      if (!subState.eat('\\u')) {
        return { matched: false };
      }
      const digits2 = [0, 0, 0, 0].map(() => subState.eatAny(...hexDigits));
      if (digits2.some(digit => digit === null)) {
        return { matched: false };
      }
      const value2 = parseInt(digits2.join(''), 16);
      if (value2 < 0xDC00 || value2 >= 0xE000) {
        return { matched: false };
      }
      return { matched: true, value: 0x10000 + ((value & 0x03FF) << 10) + (value2 & 0x03FF) };
    })(state);
    if (surrogatePairValue.matched) {
      return surrogatePairValue;
    }
  }
  return { matched: true, value };
});
/**
 * Accepts a sequence of alternatives separated by `|`, optionally closed by
 * `terminator`. Empty alternatives are allowed.
 *
 * @param {PatternAcceptorState} state
 * @param {string} [terminator] - closing string that must follow the disjunction, if given.
 * @returns {{matched: boolean}}
 */
const acceptDisjunction = (state, terminator) => {
  const hasTerminator = terminator !== void 0;
  do {
    if (hasTerminator && state.eat(terminator)) {
      return { matched: true };
    }
    // An immediate `|` means this alternative is empty: go straight to the separator check.
    const alternativeIsEmpty = state.match('|');
    if (!alternativeIsEmpty && !acceptAlternative(state, terminator).matched) {
      return { matched: false };
    }
  } while (state.eat('|'));
  if (!hasTerminator) {
    return { matched: true };
  }
  return { matched: !!state.eat(terminator) };
};
/**
 * Accepts consecutive terms until a `|`, the end of the pattern, or the
 * `terminator` is reached.
 *
 * @param {PatternAcceptorState} state
 * @param {string} [terminator] - string that ends the alternative without being consumed.
 * @returns {{matched: boolean}} `matched` is false as soon as one term is rejected.
 */
const acceptAlternative = (state, terminator) => {
  const atBoundary = () =>
    state.match('|') || state.empty() || (terminator !== void 0 && state.match(terminator));
  while (!atBoundary()) {
    const term = acceptTerm(state);
    if (!term.matched) {
      return { matched: false };
    }
  }
  return { matched: true };
};
/**
 * Combines acceptors into one that tries each in order and returns the first
 * successful result. Later acceptors are not invoked once one has matched.
 *
 * @param {...Function} acceptors - acceptors taking the state and returning `{ matched, ... }`.
 * @returns {Function} acceptor returning the first matched result, or `{ matched: false }`.
 */
const anyOf = (...acceptors) => state => {
  for (let i = 0; i < acceptors.length; i++) {
    const result = acceptors[i](state);
    if (result.matched) {
      return result;
    }
  }
  return { matched: false };
};
/**
 * Accepts a single term: an assertion or an optionally quantified atom.
 * Outside unicode mode a lookahead group may additionally carry a quantifier,
 * so that form is tried first.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean}}
 */
const acceptTerm = state => {
  // non-quantified references are rolled into quantified accepts to improve performance significantly.
  const candidates = state.unicode
    ? [acceptAssertion, acceptQuantified(acceptAtom)]
    : [acceptQuantified(acceptQuantifiableAssertion), acceptAssertion, acceptQuantified(acceptAtom)];
  return anyOf(...candidates)(state);
};
/**
 * Builds an acceptor for a parenthesised group whose opening `(` is followed
 * by a prefix recognised (and consumed) by `predicate`; the rest of the group
 * is a disjunction closed by `)`.
 *
 * @param {Function} predicate - called with the state right after `(`; a truthy result means the prefix was found.
 * @returns {Function} backtracking acceptor for the whole group.
 */
const acceptLabeledGroup = predicate => backtrackOnFailure(state => {
  const hasLabel = state.eat('(') && predicate(state);
  return hasLabel ? acceptDisjunction(state, ')') : { matched: false };
});
// Lookahead groups `(?=...)` / `(?!...)`; acceptTerm wraps this in acceptQuantified outside unicode mode.
const acceptQuantifiableAssertion = acceptLabeledGroup(state => state.eatAny('?=', '?!') !== null);
/**
 * Accepts an assertion: `^`, `$`, `\b`, `\B`, or a lookaround group.
 * In unicode mode all four lookaround prefixes are handled here; otherwise
 * only the lookbehind prefixes are.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean}}
 */
const acceptAssertion = state => {
  const simpleAssertion = state.eatAny('^', '$', '\\b', '\\B');
  if (simpleAssertion) {
    return { matched: true };
  }
  const eatLookaroundPrefix = subState => {
    const prefixes = subState.unicode ? ['?=', '?!', '?<=', '?<!'] : ['?<=', '?<!'];
    return subState.eatAny(...prefixes) !== null;
  };
  return acceptLabeledGroup(eatLookaroundPrefix)(state);
};
/**
 * Accepts one or more decimal digits.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean}} whether at least one digit was consumed.
 */
const acceptDecimal = state => {
  const digits = state.eatNaturalNumber();
  return { matched: digits !== null };
};
/**
 * Wraps an acceptor so that whatever it accepts may be followed by a
 * quantifier: `*`, `+`, `?` or a braced `{n}`, `{n,}`, `{n,m}` form, each
 * optionally followed by a lazy `?`.
 *
 * When a `{` follows but does not form a valid braced quantifier, the result
 * is a failure in unicode mode; otherwise the term is accepted without a
 * quantifier and the `{` is left unconsumed.
 *
 * @param {Function} acceptor - acceptor for the quantified element.
 * @returns {Function} backtracking acceptor for element plus optional quantifier.
 */
const acceptQuantified = acceptor => backtrackOnFailure(state => {
  if (!acceptor(state).matched) {
    return { matched: false };
  }

  if (!state.match('{')) {
    if (state.eatAny('*', '+', '?')) {
      state.eat('?');
    }
    return { matched: true };
  }

  const acceptBracedQuantifier = backtrackOnFailure(subState => {
    subState.eat('{');
    const lower = subState.eatNaturalNumber();
    if (lower === null) {
      return { matched: false };
    }
    const hasUpperBound = subState.eat(',') && subState.matchAny(...decimalDigits);
    if (hasUpperBound) {
      const upper = subState.eatNaturalNumber();
      // The bounds must be in order.
      if (upper === null || parseInt(lower, 10) > parseInt(upper, 10)) {
        return { matched: false };
      }
    }
    if (!subState.eat('}')) {
      return { matched: false };
    }
    subState.eat('?');
    return { matched: true };
  });

  const braced = acceptBracedQuantifier(state);
  return braced.matched ? braced : { matched: !state.unicode };
});
/**
 * Builds an acceptor that consumes exactly one character, provided it is not
 * listed in `characters`.
 *
 * @param {string[]} characters - characters that must be rejected.
 * @returns {Function} acceptor returning `{ matched }`; fails at end of input.
 */
const acceptCharacterExcept = characters => state => {
  const candidate = state.nextCodePoint();
  const isAllowed = candidate !== null && characters.indexOf(candidate) === -1;
  if (isAllowed) {
    state.skipCodePoint();
  }
  return { matched: isAllowed };
};
// Accepts any single character that is not one of `syntaxCharacters` (used by acceptAtom in unicode mode).
const acceptPatternCharacter = acceptCharacterExcept(syntaxCharacters);
// Accepts any single character that is not one of `extendedSyntaxCharacters`
// (acceptAtom's last resort outside unicode mode; this set leaves out `]`, `{` and `}`).
const acceptExtendedPatternCharacter = acceptCharacterExcept(extendedSyntaxCharacters);
/**
 * Recognises a complete braced quantifier (`{n}`, `{n,}` or `{n,m}`) at the
 * cursor. acceptAtom uses a hit here to reject a quantifier that has nothing
 * to quantify.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean}} whether a whole braced quantifier was consumed.
 */
const acceptInvalidBracedQuantifier = state => {
  const attempt = backtrackOnFailure(subState => {
    if (!subState.eat('{') || !acceptDecimal(subState).matched) {
      return { matched: false };
    }
    // After a comma, either the closing brace follows directly or an upper bound must be present.
    if (subState.eat(',') && !subState.match('}') && !acceptDecimal(subState).matched) {
      return { matched: false };
    }
    return { matched: !!subState.eat('}') };
  });
  return attempt(state);
};
/**
 * Accepts a single atom: a literal character, `.`, an escape, a character
 * class, a non-capturing group or a capturing group.
 *
 * Outside unicode mode, a backslash directly followed by `c` is also accepted
 * on its own when no other escape applies, a complete braced quantifier in
 * atom position is rejected, and any character outside
 * `extendedSyntaxCharacters` is accepted as a last resort.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean}}
 */
const acceptAtom = state => {
  const acceptDot = subState => ({ matched: !!subState.eat('.') });
  const acceptEscape = backtrackOnFailure(
    subState => subState.eat('\\') ? acceptAtomEscape(subState) : { matched: false }
  );
  const acceptNonCapturingGroup = acceptLabeledGroup(subState => subState.eat('?:'));

  if (state.unicode) {
    return anyOf(
      acceptPatternCharacter,
      acceptDot,
      acceptEscape,
      acceptCharacterClass,
      acceptNonCapturingGroup,
      acceptGrouping
    )(state);
  }

  // Consumes only the backslash; the `c` is merely looked at.
  const acceptBackslashBeforeC = backtrackOnFailure(
    subState => ({ matched: subState.eat('\\') && subState.match('c') })
  );
  const result = anyOf(
    acceptDot,
    acceptEscape,
    acceptBackslashBeforeC,
    acceptCharacterClass,
    acceptNonCapturingGroup,
    acceptGrouping
  )(state);
  if (result.matched) {
    return result;
  }
  if (acceptInvalidBracedQuantifier(state).matched) {
    return { matched: false };
  }
  return acceptExtendedPatternCharacter(state);
};
/**
 * Accepts a capturing group, optionally named with `?<name>`. On success the
 * capturing group count is incremented and a name, if present, is recorded.
 * A group whose name has already been recorded is rejected.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean}}
 */
const acceptGrouping = backtrackOnFailure(state => {
  if (!state.eat('(')) {
    return { matched: false };
  }

  // The name is optional: a failed attempt restores the cursor to just after `(`.
  const acceptOptionalName = backtrackOnFailure(
    subState => subState.eat('?') ? acceptGroupName(subState) : { matched: false }
  );
  const groupName = acceptOptionalName(state);

  const body = acceptDisjunction(state, ')');
  if (!body.matched) {
    return { matched: false };
  }

  if (groupName.matched) {
    const alreadyDeclared = state.groupingNames.includes(groupName.data);
    if (alreadyDeclared) {
      return { matched: false };
    }
    state.groupingNames.push(groupName.data);
  }
  state.capturingGroups += 1;
  return { matched: true };
});
/**
 * Accepts the digits of a decimal escape (the part after the backslash).
 * A leading non-zero digit starts a numeric backreference, which is recorded
 * on the state. A leading `0` is accepted by itself, except in unicode mode
 * when another decimal digit follows.
 *
 * @param {PatternAcceptorState} state - cursor positioned right after the backslash.
 * @returns {{matched: boolean}}
 */
const acceptDecimalEscape = backtrackOnFailure(state => {
  const firstDecimal = state.eatAny(...decimalDigits);
  if (firstDecimal === null) {
    return { matched: false };
  }
  if (firstDecimal === '0') {
    // In unicode mode `\0` may not be followed by a decimal digit and there are no
    // legacy octal escapes, so `\00`, `\01`, ... must be rejected rather than being
    // read as `\0` followed by a literal digit.
    const followedByDigit = state.unicode && state.matchAny(...decimalDigits);
    return { matched: !followedByDigit };
  }
  // we also accept octal escapes here, but it is impossible to tell if it is a octal escape until all parsing is complete.
  // octal escapes are handled in acceptCharacterEscape for classes
  const remainingDigits = state.eatNaturalNumber() || '';
  state.backreference(parseInt(firstDecimal + remainingDigits, 10));
  return { matched: true };
});
/**
 * Accepts a character class escape (the part after the backslash): one of
 * `d D s S w W`, or in unicode mode a property escape `p{...}` / `P{...}`.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean}}
 */
const acceptCharacterClassEscape = state => {
  if (state.eatAny('d', 'D', 's', 'S', 'w', 'W')) {
    return { matched: true };
  }
  if (!state.unicode) {
    return { matched: false };
  }
  const acceptPropertyEscape = backtrackOnFailure(subState => {
    const opened = subState.eat('p{') || subState.eat('P{');
    if (!opened || !acceptUnicodePropertyValueExpression(subState).matched) {
      return { matched: false };
    }
    return { matched: !!subState.eat('}') };
  });
  return acceptPropertyEscape(state);
};
/**
 * Consumes a run of ASCII letters and underscores.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean, data: string}} `data` is the text consumed;
 *   `matched` is false when nothing was consumed.
 */
const acceptUnicodePropertyName = state => {
  let name = '';
  for (;;) {
    const next = state.eatAny(...controlCharacters, '_');
    if (!next) {
      break;
    }
    name += next;
  }
  return { matched: name.length > 0, data: name };
};
/**
 * Consumes a run of ASCII letters, decimal digits and underscores.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean, data: string}} `data` is the text consumed;
 *   `matched` is false when nothing was consumed.
 */
const acceptUnicodePropertyValue = state => {
  let value = '';
  for (;;) {
    const next = state.eatAny(...controlCharacters, ...decimalDigits, '_');
    if (!next) {
      break;
    }
    value += next;
  }
  return { matched: value.length > 0, data: value };
};
// Non-binary property names (and their short aliases) that are excluded from mathias' property list:
// acceptLoneUnicodePropertyNameOrValue rejects them when they appear on their own, without `=Value`.
// https://www.ecma-international.org/ecma-262/9.0/index.html#table-nonbinary-unicode-properties
const illegalLoneUnicodePropertyNames = [
'General_Category',
'Script',
'Script_Extensions',
'scx',
'sc',
'gc',
];
// Mapping table for General_Category values, consulted when a lone name is not a known property.
const generalCategoryValues = matchPropertyValueMappings.get('General_Category');
/**
 * Accepts a property expression written without `=`: the text must not be
 * one of `illegalLoneUnicodePropertyNames`, and must either be recognised by
 * `matchProperty` or have an entry in the General_Category mapping table.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean}}
 */
const acceptLoneUnicodePropertyNameOrValue = state => {
  const lone = acceptUnicodePropertyValue(state);
  const isCandidate = lone.matched && !illegalLoneUnicodePropertyNames.includes(lone.data);
  if (!isCandidate) {
    return { matched: false };
  }
  // matchProperty is wrapped in catchIsFalse, so a throw counts as "not recognised".
  if (catchIsFalse(() => matchProperty(lone.data))) {
    return { matched: true };
  }
  return { matched: generalCategoryValues.get(lone.data) != null };
};
/**
 * Accepts the contents of a property escape: either `Name=Value`, validated
 * with `matchPropertyValue` after resolving the name through
 * `propertyAliases`, or a lone name/value.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean}}
 */
const acceptUnicodePropertyValueExpression = state => {
  const acceptNameEqualsValue = backtrackOnFailure(subState => {
    const name = acceptUnicodePropertyName(subState);
    if (!name.matched || !subState.eat('=')) {
      return { matched: false };
    }
    const value = acceptUnicodePropertyValue(subState);
    if (!value.matched) {
      return { matched: false };
    }
    const isKnownPair = catchIsFalse(
      () => matchPropertyValue(propertyAliases.get(name.data) || name.data, value.data)
    );
    return { matched: isKnownPair };
  });
  const acceptLone = backtrackOnFailure(acceptLoneUnicodePropertyNameOrValue);
  return anyOf(acceptNameEqualsValue, acceptLone)(state);
};
/**
 * Accepts a character escape (the part after the backslash) and reports the
 * value it denotes. The alternatives below are tried in order and the first
 * one that matches wins; each returns `{ matched, value }`.
 */
const acceptCharacterEscape = anyOf(
// Control escapes `f n r t v`; the value comes from controlEscapeCharacterValues.
state => {
let eaten = state.eatAny(...controlEscapeCharacters);
if (eaten === null) {
return { matched: false };
}
return { matched: true, value: controlEscapeCharacterValues[eaten] };
},
// `c` followed by an ASCII letter; the value is the letter's char code modulo 32.
backtrackOnFailure(state => {
if (!state.eat('c')) {
return { matched: false };
}
let character = state.eatAny(...controlCharacters);
if (character === null) {
return { matched: false };
}
return { matched: true, value: character.charCodeAt(0) % 32 };
}),
// `0` that is not followed by another decimal digit; value 0.
backtrackOnFailure(state => {
if (!state.eat('0') || state.eatAny(...decimalDigits)) {
return { matched: false };
}
return { matched: true, value: 0 };
}),
// `x` followed by exactly two hex digits.
backtrackOnFailure(state => {
if (!state.eat('x')) {
return { matched: false };
}
let digits = [0, 0].map(() => state.eatAny(...hexDigits));
if (digits.some(value => value === null)) {
return { matched: false };
}
return { matched: true, value: parseInt(digits.join(''), 16) };
}),
// `u...` escapes.
acceptUnicodeEscape,
// Non-unicode mode only: an octal escape of one to three octal digits.
// A third digit is consumed only when the first digit is below 4.
backtrackOnFailure(state => {
if (state.unicode) {
return { matched: false };
}
let octal1 = state.eatAny(...octalDigits);
if (octal1 === null) {
return { matched: false };
}
let octal1Value = parseInt(octal1, 8);
if (octalDigits.indexOf(state.nextCodePoint()) === -1) {
return { matched: true, value: octal1Value };
}
let octal2 = state.eatAny(...octalDigits);
let octal2Value = parseInt(octal2, 8);
if (octal1Value < 4) {
if (octalDigits.indexOf(state.nextCodePoint()) === -1) {
return { matched: true, value: octal1Value << 3 | octal2Value };
}
let octal3 = state.eatAny(...octalDigits);
let octal3Value = parseInt(octal3, 8);
return { matched: true, value: octal1Value << 6 | octal2Value << 3 | octal3Value };
}
return { matched: true, value: octal1Value << 3 | octal2Value };
}),
// Unicode mode only: an escaped syntax character stands for itself.
backtrackOnFailure(state => {
if (!state.unicode) {
return { matched: false };
}
let value = state.eatAny(...syntaxCharacters);
if (value === null) {
return { matched: false };
}
return { matched: true, value: value.charCodeAt(0) };
}),
// Unicode mode only: an escaped `/` stands for itself.
state => {
if (!state.unicode || !state.eat('/')) {
return { matched: false };
}
return { matched: true, value: '/'.charCodeAt(0) };
},
// Non-unicode mode only: any other character except `c` and `k` stands for itself.
backtrackOnFailure(state => {
if (state.unicode) {
return { matched: false };
}
let next = state.nextCodePoint();
if (next !== null && next !== 'c' && next !== 'k') {
state.skipCodePoint();
return { matched: true, value: next.codePointAt(0) };
}
return { matched: false };
})
);
/**
 * Accepts `k` (after the backslash) optionally followed by `<name>` and
 * records the reference on the state. When no valid group name follows, the
 * `k` is still accepted, and a sentinel is recorded in place of a name.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean}}
 */
const acceptGroupNameBackreference = backtrackOnFailure(state => {
  if (!state.eat('k')) {
    return { matched: false };
  }
  const name = acceptGroupName(state);
  const recorded = name.matched ? name.data : INVALID_NAMED_BACKREFERENCE_SENTINEL;
  state.backreferenceNames.push(recorded);
  return { matched: true };
});
/**
 * Accepts a group name of the form `<identifier>`.
 *
 * @param {PatternAcceptorState} state
 * @returns {{matched: boolean, data?: string}} on success, `data` is the name
 *   with any escapes already decoded.
 */
const acceptGroupName = backtrackOnFailure(state => {
  if (!state.eat('<')) {
    return { matched: false };
  }
  const first = state.eatIdentifierStart();
  if (!first) {
    return { matched: false };
  }
  let name = first;
  for (let part = state.eatIdentifierPart(); part; part = state.eatIdentifierPart()) {
    name += part;
  }
  if (!state.eat('>')) {
    return { matched: false };
  }
  // The name holds at least its start character here, so it is never empty.
  return { matched: true, data: name };
});
// Accepts whatever may follow a backslash in atom position. The alternatives are tried
// in this order: decimal escape / numeric backreference, character class escape,
// character escape, then `k` named backreference.
const acceptAtomEscape = anyOf(
acceptDecimalEscape,
acceptCharacterClassEscape,
acceptCharacterEscape,
acceptGroupNameBackreference
);
/**
 * Accepts a bracketed character class `[...]`, with optional leading `^`.
 * Class atoms carry a numeric `value` where one is known, so that the two ends
 * of a range `a-b` can be checked for ordering; atoms that matched without a
 * value (such as `\d`) are rejected as a range endpoint in unicode mode.
 */
const acceptCharacterClass = backtrackOnFailure(state => {
if (!state.eat('[')) {
return { matched: false };
}
state.eat('^');
// Escapes permitted after a backslash inside a class, tried in order.
const acceptClassEscape = anyOf(
// `\b` inside a class has value 0x0008.
subState => {
return { matched: !!subState.eat('b'), value: 0x0008 };
},
// Unicode mode only: `\-`.
subState => {
return { matched: subState.unicode && !!subState.eat('-'), value: '-'.charCodeAt(0) };
},
// Non-unicode mode only: `\c` followed by a decimal digit or `_`.
backtrackOnFailure(subState => {
if (subState.unicode || !subState.eat('c')) {
return { matched: false };
}
let character = subState.eatAny(...decimalDigits, '_');
if (character === null) {
return { matched: false };
}
return { matched: true, value: character.charCodeAt(0) % 32 };
}),
acceptCharacterClassEscape,
acceptCharacterEscape,
// We special-case `\k` because `acceptCharacterEscape` rejects `\k` unconditionally,
// deferring `\k` to acceptGroupNameBackreference, which is not called here.
// See also https://github.com/tc39/ecma262/issues/2037. This code takes the route of
// making it unconditionally legal, rather than legal only in the absence of a group name.
subState => {
return { matched: !subState.unicode && !!subState.eat('k'), value: 107 };
}
);
// A class atom other than a bare `-`: a literal character or an escape.
const acceptClassAtomNoDash = localState => {
let nextCodePoint = localState.nextCodePoint();
if (nextCodePoint === ']' || nextCodePoint === '-' || nextCodePoint === null) {
return { matched: false };
}
if (nextCodePoint !== '\\') {
localState.skipCodePoint();
return { matched: true, value: nextCodePoint.codePointAt(0) };
}
localState.eat('\\');
let classEscape = acceptClassEscape(localState);
// Non-unicode mode: a backslash before `c` that formed no valid escape counts as a
// literal backslash; the `c` itself is left unconsumed.
if (!classEscape.matched && localState.nextCodePoint() === 'c' && !localState.unicode) {
return { matched: true, value: '\\'.charCodeAt(0) };
}
return classEscape;
};
// A class atom, where a bare `-` is also allowed.
const acceptClassAtom = localState => {
if (localState.eat('-')) {
return { matched: true, value: '-'.charCodeAt(0) };
}
return acceptClassAtomNoDash(localState);
};
// Continues after `atom`: either completes a range `atom-otherAtom`, or moves on to the
// remaining atoms. Succeeds when the closing `]` is next; the `]` is only looked at here.
const finishClassRange = (localState, atom) => {
// True for an atom that matched but carries no `value`.
const isUnvaluedPassedAtom = subAtom => {
return subAtom.value === void 0 && subAtom.matched;
};
if (localState.eat('-')) {
// A trailing `-` directly before `]` is a literal dash.
if (localState.match(']')) {
return { matched: true };
}
let otherAtom = acceptClassAtom(localState);
if (!otherAtom.matched) {
return { matched: false };
}
// Unicode mode rejects a range with an unvalued endpoint. Otherwise the endpoints must
// be in order, a check that is skipped outside unicode mode when either one is unvalued.
if (localState.unicode && (isUnvaluedPassedAtom(atom) || isUnvaluedPassedAtom(otherAtom))) {
return { matched: false };
} else if (!(!localState.unicode && (isUnvaluedPassedAtom(atom) || isUnvaluedPassedAtom(otherAtom))) && atom.value > otherAtom.value) {
return { matched: false };
} else if (localState.match(']')) {
return { matched: true };
}
return acceptNonEmptyClassRanges(localState);
}
if (localState.match(']')) {
return { matched: true };
}
return acceptNonEmptyClassRangesNoDash(localState);
};
// One or more class atoms/ranges, where the first atom may be a bare `-`.
const acceptNonEmptyClassRanges = localState => {
let atom = acceptClassAtom(localState);
return atom.matched ? finishClassRange(localState, atom) : { matched: false };
};
// Same, but the first atom may not be a bare `-` (a `-` here is handled by finishClassRange as a range operator).
const acceptNonEmptyClassRangesNoDash = localState => {
let atom = acceptClassAtomNoDash(localState);
return atom.matched ? finishClassRange(localState, atom) : { matched: false };
};
// Empty class: `[]` or `[^]`.
if (state.eat(']')) {
return { matched: true };
}
let value = acceptNonEmptyClassRanges(state);
if (value.matched) {
state.eat(']'); // cannot fail, as above will not return matched if it is not seen in advance
}
return value;
});