1- import { Char , ReadonlyWord , Word } from "../core-types" ;
1+ import { Char } from "../core-types" ;
22import { CharSet } from "../char-set" ;
33import {
44 Node ,
@@ -16,7 +16,15 @@ import {
1616 NoParent ,
1717} from "../ast" ;
1818import { RegExpParser , AST , visitRegExpAST } from "regexpp" ;
19- import { assertNever , flatConcatSequences , repeatSequences , UnionIterable , unionSequences } from "../util" ;
19+ import {
20+ assertNever ,
21+ concatSequences ,
22+ flatConcatSequences ,
23+ repeatSequences ,
24+ UnionIterable ,
25+ unionSequences ,
26+ } from "../util" ;
27+ import { charSetToChars } from "../char-util" ;
2028import { createAssertion } from "./create-assertion" ;
2129import { createCharSet } from "./create-char-set" ;
2230import { UNICODE_MAXIMUM , UTF16_MAXIMUM } from "./util" ;
@@ -29,7 +37,8 @@ import {
2937 inheritedMatchingDirection ,
3038 somePathToBackreference ,
3139} from "./regexpp-util" ;
32- import { wordSetsToWords } from "../char-util" ;
40+ import { UTF16CaseFolding } from "./utf16-case-folding" ;
41+ import { UnicodeCaseFolding } from "./unicode" ;
3342
3443const DEFAULT_MAX_NODES = 10_000 ;
3544const DEFAULT_BACK_REF_MAX_WORDS = 100 ;
@@ -126,6 +135,10 @@ export interface ParseResult {
126135 maxCharacter : Char ;
127136}
128137
/**
 * One position of a resolved word: either a single concrete character or a
 * whole set of characters standing in for that position.
 */
type LogicalChar = Char | CharSet;
/** A mutable sequence of logical characters. */
type LogicalWord = LogicalChar[];
/** The read-only counterpart of {@link LogicalWord}. */
type ReadonlyLogicalWord = ReadonlyArray<LogicalChar>;
141+
129142interface ParserContext {
130143 readonly maxBackreferenceWords : number ;
131144 readonly backreferences : NonNullable < ParseOptions [ "backreferences" ] > ;
@@ -134,7 +147,7 @@ interface ParserContext {
134147
135148 readonly nc : NodeCreator ;
136149 readonly matchingDir : MatchingDirection ;
137- readonly variableResolved : ReadonlyMap < AST . CapturingGroup , ReadonlyWord > ;
150+ readonly variableResolved : ReadonlyMap < AST . CapturingGroup , ReadonlyLogicalWord > ;
138151}
139152
140153// Some helper constants and types to make the parser implementation more readable
@@ -171,13 +184,15 @@ export class Parser {
171184
172185 private readonly _backRefCanReachGroupCache = new Map < AST . Backreference , boolean > ( ) ;
173186 private readonly _backRefAlwaysAfterGroupCache = new Map < AST . Backreference , boolean > ( ) ;
174- private readonly _constantResolveCache = new Map < AST . CapturingGroup , ReadonlyWord | null > ( ) ;
187+ private readonly _constantResolveCache = new Map < AST . CapturingGroup , ReadonlyLogicalWord | null > ( ) ;
175188 private readonly _groupReferencesCache = new Map < AST . CapturingGroup , AST . Backreference [ ] > ( ) ;
189+ private readonly _charSetToCharFn : CharSetToCharsFn ;
176190
/**
 * Stores the parsed regexpp AST and derives everything that depends only on
 * the pattern and its flags.
 *
 * @param ast The regexpp AST (pattern + flags) this parser operates on.
 */
private constructor(ast: RegexppAst) {
	const { pattern, flags } = ast;

	this.ast = ast;
	this.literal = { source: pattern.raw, flags: flags.raw };
	// The `u` flag decides whether characters range over Unicode code points or UTF-16 code units.
	this.maxCharacter = flags.unicode ? UNICODE_MAXIMUM : UTF16_MAXIMUM;
	// Built once per parser so every word enumeration shares the same function.
	this._charSetToCharFn = createCharSetToCharsFn(flags);
}
182197
183198 /**
@@ -397,7 +412,7 @@ export class Parser {
397412 throw new Error ( "No backreferences that resolve this capturing group" ) ;
398413 }
399414
400- const words = atMostK ( iterateWords ( groupElement ) , context . maxBackreferenceWords ) ;
415+ const words = atMostK ( iterateLogicalWords ( groupElement , this . _charSetToCharFn ) , context . maxBackreferenceWords ) ;
401416 if ( words . length === 0 ) {
402417 throw new Error ( "Cannot resolve dead capturing group" ) ;
403418 }
@@ -738,14 +753,12 @@ export class Parser {
738753 return EMPTY_SET ;
739754 }
740755 }
741- private _constantResolveGroup ( element : AST . CapturingGroup , context : ParserContext ) : ReadonlyWord | null {
756+ private _constantResolveGroup ( element : AST . CapturingGroup , context : ParserContext ) : ReadonlyLogicalWord | null {
742757 const cached = this . _constantResolveCache . get ( element ) ;
743758 if ( cached !== undefined ) {
744759 return cached ;
745760 }
746761
747- let result : Word | null ;
748-
749762 const expression = this . _parseElement ( element , {
750763 ...context ,
751764 backreferences : "throw" ,
@@ -759,37 +772,24 @@ export class Parser {
759772 // if the group is constant, then all that's left will be a single alternative of only single-character
760773 // character classes
761774
762- if ( expression . alternatives . length === 1 ) {
763- const concat = expression . alternatives [ 0 ] ;
764- if (
765- concat . elements . length === 1 &&
766- concat . elements [ 0 ] . type === "CharacterClass" &&
767- concat . elements [ 0 ] . characters . isEmpty
768- ) {
775+ let words = undefined ;
776+ try {
777+ words = atMostK ( iterateLogicalWords ( expression , this . _charSetToCharFn ) , 1 ) ;
778+ } catch ( e ) {
779+ // noop
780+ }
781+
782+ let result : LogicalWord | null = null ;
783+ if ( words ) {
784+ if ( words . length === 0 ) {
769785 // since the capturing can never be matched, all backreferences to it will always be replaced with the
770786 // empty string
771787 result = [ ] ;
788+ } else if ( words . length === 1 ) {
789+ result = words [ 0 ] ;
772790 } else {
773- result = [ ] ;
774-
775- for ( const char of concat . elements ) {
776- if ( char . type === "CharacterClass" ) {
777- const charset = char . characters ;
778- if ( charset . ranges . length === 1 ) {
779- const { min, max } = charset . ranges [ 0 ] ;
780- if ( min === max ) {
781- result . push ( min ) ;
782- continue ;
783- }
784- }
785- }
786-
787- result = null ;
788- break ;
789- }
791+ throw new Error ( "More than one words were returned." ) ;
790792 }
791- } else {
792- result = null ;
793793 }
794794
795795 this . _constantResolveCache . set ( element , result ) ;
@@ -814,17 +814,20 @@ export class Parser {
814814 }
815815 private _wordToElement (
816816 source : Readonly < SourceLocation > ,
817- word : ReadonlyWord ,
817+ word : ReadonlyLogicalWord ,
818818 context : ParserContext
819819 ) : NoParent < Element > | EmptyConcat {
820820 if ( word . length === 0 ) {
821821 return EMPTY_CONCAT ;
822822 } else if ( word . length === 1 ) {
823- return context . nc . newCharClass ( source , this . _charToCharSet ( word [ 0 ] ) ) ;
823+ const char = word [ 0 ] ;
824+ const characters = char instanceof CharSet ? char : this . _charToCharSet ( char ) ;
825+ return context . nc . newCharClass ( source , characters ) ;
824826 } else {
825827 const concat = context . nc . newConcat ( source ) ;
826828 for ( const char of word ) {
827- concat . elements . push ( context . nc . newCharClass ( source , this . _charToCharSet ( char ) ) ) ;
829+ const characters = char instanceof CharSet ? char : this . _charToCharSet ( char ) ;
830+ concat . elements . push ( context . nc . newCharClass ( source , characters ) ) ;
828831 }
829832
830833 const alt = context . nc . newAlt ( source ) ;
@@ -947,7 +950,7 @@ class NodeCreator {
947950 }
948951}
949952
950- function withResolved ( context : ParserContext , group : AST . CapturingGroup , word : ReadonlyWord ) : ParserContext {
953+ function withResolved ( context : ParserContext , group : AST . CapturingGroup , word : ReadonlyLogicalWord ) : ParserContext {
951954 const variableResolved = new Map ( context . variableResolved ) ;
952955 variableResolved . set ( group , word ) ;
953956 return { ...context , variableResolved } ;
@@ -1049,8 +1052,55 @@ function removeDuplicateEmptyAlternative(element: NoParent<Expression | Alternat
10491052 } ) ;
10501053}
10511054
1052- function iterateWords ( node : NoParent < Node > ) : UnionIterable < Word > {
1053- return wordSetsToWords ( iterateWordSets ( node ) ) ;
/** Enumerates the logical characters contained in a character set. */
type CharSetToCharsFn = (charSet: CharSet) => Iterable<LogicalChar>;

/**
 * Creates the function used to enumerate the logical characters of a
 * character set for a pattern with the given flags.
 *
 * Without the `i` flag, the plain `charSetToChars` function is returned
 * unchanged. With the `i` flag, the returned generator looks every character
 * up in the case folding table selected by the `u` flag:
 *
 * - a character with an entry yields a `CharSet` built from all members of
 *   that entry, and the other members are then skipped so the entry is
 *   yielded at most once per call;
 * - a character without an entry is yielded as-is.
 *
 * NOTE(review): the folding tables are assumed to map a character to the list
 * of all characters it is case-insensitively equal to - confirm against the
 * table modules.
 *
 * @param flags The flags of the regular expression being parsed.
 * @returns The enumeration function matching `flags`.
 */
function createCharSetToCharsFn(flags: AST.Flags): CharSetToCharsFn {
	if (!flags.ignoreCase) {
		return charSetToChars;
	}

	const foldingTable = flags.unicode ? UnicodeCaseFolding : UTF16CaseFolding;
	const maximum = flags.unicode ? UNICODE_MAXIMUM : UTF16_MAXIMUM;

	// Keyed by the identity of the table entry, so each entry is converted to a CharSet only once
	// and the same CharSet instance is reused across calls.
	const classSets = new Map<readonly number[], CharSet>();

	const toCharSet = (members: readonly number[]): CharSet => {
		const known = classSets.get(members);
		if (known !== undefined) {
			return known;
		}

		const created = CharSet.empty(maximum).union(members.map(member => ({ min: member, max: member })));
		classSets.set(members, created);
		return created;
	};

	return function* charSetToLogicalChars(charSet: CharSet): Iterable<LogicalChar> {
		// Characters already covered by a previously yielded table entry.
		const visited = new Set<Char>();

		for (const current of charSetToChars(charSet)) {
			if (visited.has(current)) {
				continue;
			}

			const members: readonly number[] | undefined = foldingTable[current];
			if (!members) {
				yield current;
				continue;
			}

			for (const member of members) {
				visited.add(member);
			}
			yield toCharSet(members);
		}
	};
}
1092+
/**
 * Expands word sets into logical words.
 *
 * Each word set is a sequence of character sets. Every character set is
 * turned into its logical characters with `charSetToChars`, and the resulting
 * sequences are handed to `concatSequences`, whose results are yielded.
 *
 * NOTE(review): `concatSequences` presumably yields every combination that
 * picks one item per position - confirm against its definition in `../util`.
 *
 * @param wordSets The word sets to expand.
 * @param charSetToChars Enumerates the logical characters of one character set.
 * @returns The logical words of all given word sets, in iteration order.
 */
function* wordSetsToLogicalWords(
	wordSets: Iterable<readonly CharSet[]>,
	charSetToChars: CharSetToCharsFn
): Iterable<LogicalWord> {
	for (const sets of wordSets) {
		const choicesPerPosition = sets.map(charSetToChars);
		yield* concatSequences(choicesPerPosition);
	}
}
1101+
/**
 * Iterates the logical words of the given node by first iterating its word
 * sets and then expanding each of them with `charSetToChars`.
 *
 * @param node The AST node whose words are enumerated.
 * @param charSetToChars Enumerates the logical characters of one character set.
 * @returns An iterable of the node's logical words.
 */
function iterateLogicalWords(node: NoParent<Node>, charSetToChars: CharSetToCharsFn): UnionIterable<LogicalWord> {
	const wordSets = iterateWordSets(node);
	return wordSetsToLogicalWords(wordSets, charSetToChars);
}
10551105function iterateWordSets ( node : NoParent < Node > ) : UnionIterable < CharSet [ ] > {
10561106 switch ( node . type ) {
0 commit comments