From dd68947ea86d4757cd4636539ad47b71cbebb938 Mon Sep 17 00:00:00 2001 From: Robert Einhorn Date: Thu, 25 Dec 2025 23:18:25 +0100 Subject: [PATCH 1/6] Update Python 3.14.2 grammar Updated the Python 3.14.2 grammar and synchronized related lexer/parser base files. This update includes rule refinements, token adjustments, directory restructuring, and cleanup of outdated Python 3.13 artifacts to ensure full alignment with the 3.14.2 specification. Signed-off-by: Robert Einhorn --- python/pom.xml | 2 +- python/python3_13/CSharp/PythonLexerBase.cs | 802 ------------ python/python3_13/Java/PythonLexerBase.java | 684 ---------- .../python3_13/JavaScript/PythonLexerBase.js | 676 ---------- python/python3_13/Python3/PythonLexerBase.py | 557 -------- python/python3_13/README.md | 37 - .../python3_13/TypeScript/PythonLexerBase.ts | 677 ---------- python/python3_13/examples/_colorize.py | 64 - python/python3_13/examples/_compression.py | 162 --- .../python3_13/examples/_opcode_metadata.py | 343 ----- python/python3_13/examples/_pylong.py | 363 ------ .../python3_13/examples/_threading_local.py | 242 ---- python/python3_13/examples/_weakrefset.py | 205 --- python/python3_14/CSharp/PythonLexerBase.cs | 877 +++++++++++++ python/python3_14/Java/PythonLexerBase.java | 768 +++++++++++ .../python3_14/JavaScript/PythonLexerBase.js | 778 +++++++++++ python/python3_14/Python3/PythonLexerBase.py | 595 +++++++++ .../Python3_14_2_official_grammar.peg} | 62 +- .../{python3_13 => python3_14}/PythonLexer.g4 | 528 +++++--- .../PythonParser.g4 | 74 +- python/python3_14/README.md | 23 + .../python3_14/TypeScript/PythonLexerBase.ts | 779 +++++++++++ python/{python3_13 => python3_14}/changes.md | 6 + python/{python3_13 => python3_14}/desc.xml | 0 .../examples/__future__.py | 0 .../examples/__hello__.py | 0 .../examples/_aix_support.py | 0 .../examples/_android_support.py | 18 +- python/python3_14/examples/_apple_support.py | 66 + python/python3_14/examples/_ast_unparse.py | 1161 +++++++++++++++++ 
.../examples/_collections_abc.py | 37 +- python/python3_14/examples/_colorize.py | 355 +++++ .../examples/_compat_pickle.py | 0 .../examples/_ios_support.py | 0 .../examples/_markupbase.py | 2 +- .../python3_14/examples/_opcode_metadata.py | 371 ++++++ .../examples/_osx_support.py | 0 .../examples/_py_abc.py | 0 python/python3_14/examples/_py_warnings.py | 869 ++++++++++++ .../examples/_pydatetime.py | 206 ++- .../examples/_pydecimal.py | 92 +- .../examples/_pyio.py | 193 ++- python/python3_14/examples/_pylong.py | 729 +++++++++++ .../examples/_sitebuiltins.py | 0 .../examples/_strptime.py | 259 +++- python/{python3_13 => python3_14}/pom.xml | 4 +- 46 files changed, 8411 insertions(+), 5255 deletions(-) delete mode 100644 python/python3_13/CSharp/PythonLexerBase.cs delete mode 100644 python/python3_13/Java/PythonLexerBase.java delete mode 100644 python/python3_13/JavaScript/PythonLexerBase.js delete mode 100644 python/python3_13/Python3/PythonLexerBase.py delete mode 100644 python/python3_13/README.md delete mode 100644 python/python3_13/TypeScript/PythonLexerBase.ts delete mode 100644 python/python3_13/examples/_colorize.py delete mode 100644 python/python3_13/examples/_compression.py delete mode 100644 python/python3_13/examples/_opcode_metadata.py delete mode 100644 python/python3_13/examples/_pylong.py delete mode 100644 python/python3_13/examples/_threading_local.py delete mode 100644 python/python3_13/examples/_weakrefset.py create mode 100644 python/python3_14/CSharp/PythonLexerBase.cs create mode 100644 python/python3_14/Java/PythonLexerBase.java create mode 100644 python/python3_14/JavaScript/PythonLexerBase.js create mode 100644 python/python3_14/Python3/PythonLexerBase.py rename python/{python3_13/Python3_13_2_official_grammar.peg => python3_14/Python3_14_2_official_grammar.peg} (94%) rename python/{python3_13 => python3_14}/PythonLexer.g4 (61%) rename python/{python3_13 => python3_14}/PythonParser.g4 (93%) create mode 100644 python/python3_14/README.md 
create mode 100644 python/python3_14/TypeScript/PythonLexerBase.ts rename python/{python3_13 => python3_14}/changes.md (82%) rename python/{python3_13 => python3_14}/desc.xml (100%) rename python/{python3_13 => python3_14}/examples/__future__.py (100%) rename python/{python3_13 => python3_14}/examples/__hello__.py (100%) rename python/{python3_13 => python3_14}/examples/_aix_support.py (100%) rename python/{python3_13 => python3_14}/examples/_android_support.py (93%) create mode 100644 python/python3_14/examples/_apple_support.py create mode 100644 python/python3_14/examples/_ast_unparse.py rename python/{python3_13 => python3_14}/examples/_collections_abc.py (97%) create mode 100644 python/python3_14/examples/_colorize.py rename python/{python3_13 => python3_14}/examples/_compat_pickle.py (100%) rename python/{python3_13 => python3_14}/examples/_ios_support.py (100%) rename python/{python3_13 => python3_14}/examples/_markupbase.py (99%) create mode 100644 python/python3_14/examples/_opcode_metadata.py rename python/{python3_13 => python3_14}/examples/_osx_support.py (100%) rename python/{python3_13 => python3_14}/examples/_py_abc.py (100%) create mode 100644 python/python3_14/examples/_py_warnings.py rename python/{python3_13 => python3_14}/examples/_pydatetime.py (92%) rename python/{python3_13 => python3_14}/examples/_pydecimal.py (98%) rename python/{python3_13 => python3_14}/examples/_pyio.py (94%) create mode 100644 python/python3_14/examples/_pylong.py rename python/{python3_13 => python3_14}/examples/_sitebuiltins.py (100%) rename python/{python3_13 => python3_14}/examples/_strptime.py (75%) rename python/{python3_13 => python3_14}/pom.xml (95%) diff --git a/python/pom.xml b/python/pom.xml index c99982bdfa..d0deb48fa2 100644 --- a/python/pom.xml +++ b/python/pom.xml @@ -16,6 +16,6 @@ python2 python3 python2_7_18 - python3_13 + python3_14 diff --git a/python/python3_13/CSharp/PythonLexerBase.cs b/python/python3_13/CSharp/PythonLexerBase.cs deleted file mode 
100644 index bbaf1bc678..0000000000 --- a/python/python3_13/CSharp/PythonLexerBase.cs +++ /dev/null @@ -1,802 +0,0 @@ -/* -The MIT License (MIT) -Copyright (c) 2021 Robert Einhorn - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
- */ - -/* - * Project : Python Indent/Dedent handler for ANTLR4 grammars - * - * Developed by : Robert Einhorn - */ - -#nullable enable -using Antlr4.Runtime; -using System; -using System.Text; -using System.Text.RegularExpressions; -using System.IO; -using System.Collections.Generic; - -[assembly: CLSCompliant(true)] - -public abstract class PythonLexerBase : Lexer -{ - // A stack that keeps track of the indentation lengths - private Stack indentLengthStack = new(); - // A list where tokens are waiting to be loaded into the token stream - private LinkedList pendingTokens = new(); - - // last pending token type - private int previousPendingTokenType; - private int lastPendingTokenTypeFromDefaultChannel; - - // The amount of opened parentheses, square brackets, or curly braces - private int opened; - // The amount of opened parentheses and square brackets in the current lexer mode - private Stack paren_or_bracket_openedStack = new(); - // A stack that stores expression(s) between braces in fstring - private Stack braceExpressionStack = new(); - private string prevBraceExpression = ""; - - // Instead of this._mode (_mode is not implemented in each ANTLR4 runtime) - private int curLexerMode; - // Instead of this._modeStack (_modeStack is not implemented in each ANTLR4 runtime) - private Stack lexerModeStack = new(); - - private bool wasSpaceIndentation; - private bool wasTabIndentation; - private bool wasIndentationMixedWithSpacesAndTabs; - - private IToken curToken = null!; // current (under processing) token - private IToken ffgToken = null!; // following (look ahead) token - - private const int INVALID_LENGTH = -1; - private const string ERR_TXT = " ERROR: "; - - protected PythonLexerBase(ICharStream input) : base(input) - { - } - - protected PythonLexerBase(ICharStream input, TextWriter output, TextWriter errorOutput) : base(input, output, errorOutput) - { - } - - public override IToken NextToken() // reading the intStream stream until a return EOF - { - 
this.CheckNextToken(); - IToken firstPendingToken = this.pendingTokens.First!.Value; - this.pendingTokens.RemoveFirst(); - return firstPendingToken; // add the queued token to the token stream - } - - public override void Reset() - { - this.Init(); - base.Reset(); - } - - private void Init() - { - this.indentLengthStack = new(); - this.pendingTokens = new(); - this.previousPendingTokenType = 0; - this.lastPendingTokenTypeFromDefaultChannel = 0; - this.opened = 0; - this.paren_or_bracket_openedStack = new(); - this.braceExpressionStack = new(); - this.prevBraceExpression = ""; - this.curLexerMode = 0; - this.lexerModeStack = new(); - this.wasSpaceIndentation = false; - this.wasTabIndentation = false; - this.wasIndentationMixedWithSpacesAndTabs = false; - this.curToken = null!; - this.ffgToken = null!; - } - - private void CheckNextToken() - { - if (this.previousPendingTokenType == TokenConstants.EOF) - return; - - if (this.indentLengthStack.Count == 0) // We're at the first token - { - this.InsertENCODINGtoken(); - this.SetCurrentAndFollowingTokens(); - this.HandleStartOfInput(); - } - else - { - this.SetCurrentAndFollowingTokens(); - } - - - switch (this.curToken.Type) - { - case PythonLexer.NEWLINE: - this.HandleNEWLINEtoken(); - break; - case PythonLexer.LPAR: - case PythonLexer.LSQB: - case PythonLexer.LBRACE: - this.opened++; - this.AddPendingToken(this.curToken); - break; - case PythonLexer.RPAR: - case PythonLexer.RSQB: - case PythonLexer.RBRACE: - this.opened--; - this.AddPendingToken(this.curToken); - break; - case PythonLexer.FSTRING_MIDDLE: - this.HandleFSTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field - this.AddPendingToken(this.curToken); - break; - case PythonLexer.COLONEQUAL: - this.HandleCOLONEQUALtokenInFString(); - break; - case PythonLexer.ERRORTOKEN: - this.ReportLexerError("token recognition error at: '" + this.curToken.Text + "'"); - this.AddPendingToken(this.curToken); - break; - case TokenConstants.EOF: - 
this.HandleEOFtoken(); - break; - default: - this.AddPendingToken(this.curToken); - break; - } - this.HandleFORMAT_SPECIFICATION_MODE(); - } - - private void SetCurrentAndFollowingTokens() - { - this.curToken = this.ffgToken == null ? - base.NextToken() : - this.ffgToken; - - this.CheckCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)! - - this.ffgToken = this.curToken.Type == TokenConstants.EOF ? - this.curToken : - base.NextToken(); - } - - private void InsertENCODINGtoken() // https://peps.python.org/pep-0263/ - { - var lineBuilder = new StringBuilder(); - var encodingName = ""; - var lineCount = 0; - var ws_commentPattern = new Regex("^[ \t\f]*(#.*)?$"); - var intStream = this.InputStream; - var size = intStream.Size; - - intStream.Seek(0); - for (int i = 0; i < size; i++) - { - char c = (char)intStream.LA(i + 1); - lineBuilder.Append(c); - - if (c == '\n' || i == size - 1) - { - string line = lineBuilder.ToString().Replace("\r", "").Replace("\n", ""); - if (ws_commentPattern.IsMatch(line)) // WS* + COMMENT? 
found - { - encodingName = GetEncodingName(line); - if (encodingName != "") - { - break; // encoding found - } - } - else - { - break; // statement or backslash found (line is not empty, not whitespace(s), not comment) - } - - lineCount++; - if (lineCount >= 2) - { - break; // check only the first two lines - } - lineBuilder.Clear(); - } - } - - if (encodingName == "") - { - encodingName = "utf-8"; // default Python source code encoding - } - - var encodingToken = new CommonToken(PythonLexer.ENCODING, encodingName); - encodingToken.Channel = TokenConstants.HiddenChannel; - encodingToken.StartIndex = 0; - encodingToken.StopIndex = 0; - encodingToken.Line = 0; - encodingToken.Column = -1; - AddPendingToken(encodingToken); - } - - private static string GetEncodingName(string commentText) // https://peps.python.org/pep-0263/#defining-the-encoding - { - var encodingCommentPattern = new Regex("^[ \\t\\f]*#.*?coding[:=][ \\t]*([-_.a-zA-Z0-9]+)"); - var match = encodingCommentPattern.Match(commentText); - return match.Success ? 
match.Groups[1].Value : string.Empty; - } - - // initialize the _indentLengths - // hide the leading NEWLINE token(s) - // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel - // insert a leading INDENT token if necessary - private void HandleStartOfInput() - { - // initialize the stack with a default 0 indentation length - this.indentLengthStack.Push(0); // this will never be popped off - while (this.curToken.Type != TokenConstants.EOF) - { - if (this.curToken.Channel == TokenConstants.DefaultChannel) - { - if (this.curToken.Type == PythonLexer.NEWLINE) - { - // all the NEWLINE tokens must be ignored before the first statement - this.HideAndAddPendingToken(this.curToken); - } - else - { // We're at the first statement - this.InsertLeadingIndentToken(); - return; // continue the processing of the current token with CheckNextToken() - } - } - else - { - this.AddPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING, or COMMENT token - } - this.SetCurrentAndFollowingTokens(); - } // continue the processing of the EOF token with CheckNextToken() - } - - private void InsertLeadingIndentToken() - { - if (this.previousPendingTokenType == PythonLexer.WS) - { - IToken prevToken = this.pendingTokens.Last!.Value; - if (this.GetIndentationLength(prevToken.Text) != 0) // there is an "indentation" before the first statement - { - const string errMsg = "first statement indented"; - this.ReportLexerError(errMsg); - // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser - this.CreateAndAddPendingToken(PythonLexer.INDENT, TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, this.curToken); - } - } - } - - private void HandleNEWLINEtoken() - { - if (this.lexerModeStack.Count > 0) - { - this.AddPendingToken(this.curToken); - } - else if (this.opened > 0) - { - // We're in an implicit line joining, ignore the current NEWLINE token - 
this.HideAndAddPendingToken(this.curToken); - } - else - { - IToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token - bool isLookingAhead = this.ffgToken.Type == PythonLexer.WS; - if (isLookingAhead) - { - this.SetCurrentAndFollowingTokens(); // set the next two tokens - } - - switch (this.ffgToken.Type) - { - case PythonLexer.NEWLINE: // We're before a blank line - case PythonLexer.COMMENT: // We're before a comment - this.HideAndAddPendingToken(nlToken); - if (isLookingAhead) - { - this.AddPendingToken(this.curToken); // WS token - } - break; - default: - this.AddPendingToken(nlToken); - if (isLookingAhead) - { // We're on a whitespace(s) followed by a statement - int indentationLength = this.ffgToken.Type == TokenConstants.EOF ? - 0 : - this.GetIndentationLength(this.curToken.Text); - - if (indentationLength != PythonLexerBase.INVALID_LENGTH) - { - this.AddPendingToken(this.curToken); // WS token - this.InsertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) - } - else - { - this.ReportError("inconsistent use of tabs and spaces in indentation"); - } - } - else - { - // We're at a newline followed by a statement (there is no whitespace before the statement) - this.InsertIndentOrDedentToken(0); // may insert DEDENT token(s) - } - break; - } - } - } - - private void InsertIndentOrDedentToken(int indentLength) - { - int prevIndentLength = this.indentLengthStack.Peek(); - if (indentLength > prevIndentLength) - { - this.CreateAndAddPendingToken(PythonLexer.INDENT, TokenConstants.DefaultChannel, null, this.ffgToken); - this.indentLengthStack.Push(indentLength); - } - else - { - while (indentLength < prevIndentLength) - { // more than 1 DEDENT token may be inserted into the token stream - this.indentLengthStack.Pop(); - prevIndentLength = this.indentLengthStack.Peek(); - if (indentLength <= prevIndentLength) - { - this.CreateAndAddPendingToken(PythonLexer.DEDENT, TokenConstants.DefaultChannel, null, 
this.ffgToken); - } - else - { - this.ReportError("inconsistent dedent"); - } - } - } - } - - private void CheckCurToken() - { - switch (this.curToken.Type) - { - case PythonLexer.FSTRING_START: - this.SetLexerModeByFSTRING_STARTtoken(); - return; - case PythonLexer.FSTRING_MIDDLE: - this.HandleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field - if (this.curToken.Type == PythonLexer.FSTRING_MIDDLE) - return; // No curToken exchange happened - break; - case PythonLexer.FSTRING_END: - this.PopLexerMode(); - return; - default: - if (this.lexerModeStack.Count == 0) - return; // Not in fstring mode - break; - } - - switch (this.curToken.Type) - { - case PythonLexer.NEWLINE: - // append the current brace expression with the current newline - this.AppendToBraceExpression(this.curToken.Text); - var ctkn = new CommonToken(this.curToken); - ctkn.Channel = TokenConstants.HiddenChannel; - this.curToken = ctkn; - break; - case PythonLexer.LBRACE: - // the outermost brace expression cannot be a dictionary comprehension or a set comprehension - this.braceExpressionStack.Push("{"); - this.paren_or_bracket_openedStack.Push(0); - this.PushLexerMode(Lexer.DEFAULT_MODE); - break; - case PythonLexer.LPAR: - case PythonLexer.LSQB: - // append the current brace expression with a "(" or a "[" - this.AppendToBraceExpression(this.curToken.Text); - // https://peps.python.org/pep-0498/#lambdas-inside-expressions - this.IncrementBraceStack(); - break; - case PythonLexer.RPAR: - case PythonLexer.RSQB: - // append the current brace expression with a ")" or a "]" - this.AppendToBraceExpression(this.curToken.Text); - this.DecrementBraceStack(); - break; - case PythonLexer.COLON: - case PythonLexer.COLONEQUAL: - // append the current brace expression with a ":" or a ":=" - this.AppendToBraceExpression(this.curToken.Text); - this.SetLexerModeByCOLONorCOLONEQUALtoken(); - break; - case PythonLexer.RBRACE: - this.SetLexerModeAfterRBRACEtoken(); - break; - default: - // append the 
current brace expression with the current token text - this.AppendToBraceExpression(this.curToken.Text); - break; - } - } - - private void AppendToBraceExpression(string text) - { - this.braceExpressionStack.Push(this.braceExpressionStack.Pop() + text); - } - - private void IncrementBraceStack() - { // increment the last element (peek() + 1) - this.paren_or_bracket_openedStack.Push(this.paren_or_bracket_openedStack.Pop() + 1); - } - - private void DecrementBraceStack() - { // decrement the last element (peek() - 1) - this.paren_or_bracket_openedStack.Push(this.paren_or_bracket_openedStack.Pop() - 1); - } - - private void SetLexerModeAfterRBRACEtoken() - { - switch (this.curLexerMode) - { - case Lexer.DEFAULT_MODE: - this.PopLexerMode(); - this.PopByBRACE(); - break; - case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE: - case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE: - case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE: - this.PopLexerMode(); - this.PopLexerMode(); - this.PopByBRACE(); - break; - default: - this.ReportLexerError("f-string: single '}' is not allowed"); - break; - } - } - - private void SetLexerModeByFSTRING_STARTtoken() - { - string text = this.curToken.Text.ToLower(); - var modeMap = new Dictionary - { - { "f'", PythonLexer.SQ1__FSTRING_MODE }, - { "rf'", PythonLexer.SQ1R_FSTRING_MODE }, - { "fr'", PythonLexer.SQ1R_FSTRING_MODE }, - { "f\"", PythonLexer.DQ1__FSTRING_MODE }, - { "rf\"", PythonLexer.DQ1R_FSTRING_MODE }, - { "fr\"", PythonLexer.DQ1R_FSTRING_MODE }, - { "f'''", PythonLexer.SQ3__FSTRING_MODE }, - { "rf'''", PythonLexer.SQ3R_FSTRING_MODE }, - { "fr'''", PythonLexer.SQ3R_FSTRING_MODE }, - { "f\"\"\"", PythonLexer.DQ3__FSTRING_MODE }, - { "rf\"\"\"", PythonLexer.DQ3R_FSTRING_MODE }, - { 
"fr\"\"\"", PythonLexer.DQ3R_FSTRING_MODE } - }; - - if (modeMap.TryGetValue(text, out int mode)) - { - this.PushLexerMode(mode); - } - } - - private void SetLexerModeByCOLONorCOLONEQUALtoken() - { - if (this.paren_or_bracket_openedStack.Peek() == 0) - { - // COLONEQUAL token will be replaced with a COLON token in CheckNextToken() - switch (this.lexerModeStack.Peek()) - { // check the previous lexer mode (the current is DEFAULT_MODE) - case PythonLexer.SQ1__FSTRING_MODE: - case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE: - this.PushLexerMode(PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ1R_FSTRING_MODE: - case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE: - this.PushLexerMode(PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ1__FSTRING_MODE: - case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE: - this.PushLexerMode(PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ1R_FSTRING_MODE: - case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE: - this.PushLexerMode(PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ3__FSTRING_MODE: - case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE: - this.PushLexerMode(PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ3R_FSTRING_MODE: - case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE: - this.PushLexerMode(PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ3__FSTRING_MODE: - case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE: - this.PushLexerMode(PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ3R_FSTRING_MODE: - case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE: - this.PushLexerMode(PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. 
mode - break; - } - } - } - - private void PopByBRACE() - { - this.paren_or_bracket_openedStack.Pop(); - this.prevBraceExpression = this.braceExpressionStack.Pop() + "}"; - if (this.braceExpressionStack.Count > 0) - { - // append the current brace expression with the previous brace expression - this.braceExpressionStack.Push(this.braceExpressionStack.Pop() + this.prevBraceExpression); - } - - } - - private void HandleFSTRING_MIDDLEtokenWithDoubleBrace() - { - // replace the trailing double brace with a single brace and insert a hidden brace token - switch (this.GetLastTwoCharsOfTheCurTokenText()) - { - case "{{": - this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", TokenConstants.HiddenChannel); - break; - case "}}": - this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", TokenConstants.HiddenChannel); - break; - } - } - - private void HandleFSTRING_MIDDLEtokenWithQuoteAndLBrace() - { - // replace the trailing quote + left_brace with a quote and insert an LBRACE token - // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token - switch (this.GetLastTwoCharsOfTheCurTokenText()) - { - case "\"{": - case "'{": - case "\\{": - this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", TokenConstants.DefaultChannel); - break; - } - } - - private string GetLastTwoCharsOfTheCurTokenText() - { - string curTokenText = this.curToken.Text; - return curTokenText.Length >= 2 ? 
curTokenText.Substring(curTokenText.Length - 2) : curTokenText; - } - - private void TrimLastCharAddPendingTokenSetCurToken(int type, string text, int channel) - { - // trim the last char and add the modified curToken to the pendingTokens stack - string curTokenText = this.curToken.Text; - string tokenTextWithoutLastChar = curTokenText.Substring(0, curTokenText.Length - 1); - var ctkn = new CommonToken(this.curToken); - ctkn.Text = tokenTextWithoutLastChar; - ctkn.StopIndex = ctkn.StopIndex - 1; - this.AddPendingToken(ctkn); - - this.CreateNewCurToken(type, text, channel); // set curToken - } - - private void HandleCOLONEQUALtokenInFString() - { - if (this.lexerModeStack.Count > 0 && - this.paren_or_bracket_openedStack.Peek() == 0) - { - // In fstring a colonequal (walrus operator) can only be used in parentheses - // Not in parentheses, replace COLONEQUAL token with COLON as format specifier - // and insert the equal symbol to the following FSTRING_MIDDLE token - var ctkn = new CommonToken(this.curToken); - ctkn.Type = PythonLexer.COLON; - ctkn.Text = ":"; - ctkn.StopIndex = ctkn.StartIndex; - this.curToken = ctkn; - if (this.ffgToken.Type == PythonLexer.FSTRING_MIDDLE) - { - ctkn = new CommonToken(this.ffgToken); - ctkn.Text = "=" + ctkn.Text; - ctkn.StartIndex -= 1; - ctkn.Column -= 1; - this.ffgToken = ctkn; - } - else - { - this.AddPendingToken(this.curToken); - this.CreateNewCurToken(PythonLexer.FSTRING_MIDDLE, "=", TokenConstants.DefaultChannel); - } - } - this.AddPendingToken(this.curToken); - } - - private void CreateNewCurToken(int type, string text, int channel) - { - var ctkn = new CommonToken(this.curToken); - ctkn.Type = type; - ctkn.Text = text; - ctkn.Channel = channel; - ctkn.Column += 1; - ctkn.StartIndex += 1; - ctkn.StopIndex = ctkn.StartIndex; - this.curToken = ctkn; - } - - private void PushLexerMode(int mode) - { - this.PushMode(mode); - this.lexerModeStack.Push(this.curLexerMode); - this.curLexerMode = mode; - } - - private void 
PopLexerMode() - { - this.PopMode(); - this.curLexerMode = this.lexerModeStack.Pop(); - } - - private void HandleFORMAT_SPECIFICATION_MODE() - { - if (this.lexerModeStack.Count > 0 - && this.ffgToken.Type == PythonLexer.RBRACE) - { - // insert an empty FSTRING_MIDDLE token instead of the missing format specification - switch (this.curToken.Type) - { - case PythonLexer.COLON: - this.CreateAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, TokenConstants.DefaultChannel, "", this.ffgToken); - break; - case PythonLexer.RBRACE: - // only if the previous brace expression is not a dictionary comprehension or set comprehension - if (!IsDictionaryComprehensionOrSetComprehension(this.prevBraceExpression)) - { - this.CreateAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, TokenConstants.DefaultChannel, "", this.ffgToken); - } - break; - } - } - } - - private static bool IsDictionaryComprehensionOrSetComprehension(string code) - { - var inputStream = CharStreams.fromString(code); - var lexer = new PythonLexer(inputStream); - var tokenStream = new CommonTokenStream(lexer); - var parser = new PythonParser(tokenStream); - - // Disable error listeners to suppress console output - lexer.RemoveErrorListeners(); - parser.RemoveErrorListeners(); - - parser.dictcomp(); // Try parsing as dictionary comprehension - if (parser.NumberOfSyntaxErrors == 0) - return true; - - parser = new PythonParser(tokenStream); - tokenStream.Seek(0); - parser.RemoveErrorListeners(); - parser.setcomp(); // Try parsing as set comprehension - return parser.NumberOfSyntaxErrors == 0; - } - - private void InsertTrailingTokens() - { - switch (this.lastPendingTokenTypeFromDefaultChannel) - { - case PythonLexer.NEWLINE: - case PythonLexer.DEDENT: - break; // no trailing NEWLINE token is needed - default: - // insert an extra trailing NEWLINE token that serves as the end of the last statement - this.CreateAndAddPendingToken(PythonLexer.NEWLINE, TokenConstants.DefaultChannel, null, this.ffgToken); // ffgToken is EOF - 
break; - } - this.InsertIndentOrDedentToken(0); // Now insert as many trailing DEDENT tokens as needed - } - - private void HandleEOFtoken() - { - if (this.lastPendingTokenTypeFromDefaultChannel > 0) - { // there was a statement in the intStream (leading NEWLINE tokens are hidden) - this.InsertTrailingTokens(); - } - this.AddPendingToken(this.curToken); - } - - private void HideAndAddPendingToken(IToken tkn) - { - var ctkn = new CommonToken(tkn); - ctkn.Channel = TokenConstants.HiddenChannel; - this.AddPendingToken(ctkn); - } - - private void CreateAndAddPendingToken(int ttype, int channel, string? text, IToken sampleToken) - { - var ctkn = new CommonToken(sampleToken); - ctkn.Type = ttype; - ctkn.Channel = channel; - ctkn.StopIndex = sampleToken.StartIndex - 1; - ctkn.Text = text ?? "<" + this.Vocabulary.GetSymbolicName(ttype) + ">"; - - this.AddPendingToken(ctkn); - } - - private void AddPendingToken(IToken tkn) - { - // save the last pending token type because the pendingTokens list can be empty by the nextToken() - this.previousPendingTokenType = tkn.Type; - if (tkn.Channel == TokenConstants.DefaultChannel) - { - this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; - } - this.pendingTokens.AddLast(tkn); - } - - private int GetIndentationLength(string indentText) // the indentText may contain spaces, tabs or form feeds - { - const int TAB_LENGTH = 8; // the standard number of spaces to replace a tab with spaces - int length = 0; - foreach (char ch in indentText) - { - switch (ch) - { - case ' ': - this.wasSpaceIndentation = true; - length += 1; - break; - case '\t': - this.wasTabIndentation = true; - length += TAB_LENGTH - (length % TAB_LENGTH); - break; - case '\f': // form feed - length = 0; - break; - } - } - - if (this.wasTabIndentation && this.wasSpaceIndentation) - { - if (!this.wasIndentationMixedWithSpacesAndTabs) - { - this.wasIndentationMixedWithSpacesAndTabs = true; - length = PythonLexerBase.INVALID_LENGTH; // only for the 
first inconsistent indent - } - } - return length; - } - - private void ReportLexerError(string errMsg) - { - this.ErrorListenerDispatch.SyntaxError(this.ErrorOutput, this, this.curToken.Type, this.curToken.Line, this.curToken.Column, " LEXER" + PythonLexerBase.ERR_TXT + errMsg, null); - } - - private void ReportError(string errMsg) - { - this.ReportLexerError(errMsg); - - // the ERRORTOKEN will raise an error in the parser - this.CreateAndAddPendingToken(PythonLexer.ERRORTOKEN, TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken); - } -} diff --git a/python/python3_13/Java/PythonLexerBase.java b/python/python3_13/Java/PythonLexerBase.java deleted file mode 100644 index ab5eb88751..0000000000 --- a/python/python3_13/Java/PythonLexerBase.java +++ /dev/null @@ -1,684 +0,0 @@ -/* -The MIT License (MIT) -Copyright (c) 2021 Robert Einhorn - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
- */ - -/* - * - * Project : Python Indent/Dedent handler for ANTLR4 grammars - * - * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com - * - */ - -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.antlr.v4.runtime.*; - -public abstract class PythonLexerBase extends Lexer { - // A stack that keeps track of the indentation lengths - private Deque indentLengthStack; - // A list where tokens are waiting to be loaded into the token stream - private Deque pendingTokens; - - // last pending token type - private int previousPendingTokenType; - private int lastPendingTokenTypeFromDefaultChannel; - - // The amount of opened parentheses, square brackets or curly braces - private int opened; - // The amount of opened parentheses and square brackets in the current lexer mode - private Deque paren_or_bracket_openedStack; - // A stack that stores expression(s) between braces in fstring - private Deque braceExpressionStack; - private String prevBraceExpression; - - // Instead of this._mode (_mode is not implemented in each ANTLR4 runtime) - private int curLexerMode; - // Instead of this._modeStack (_modeStack is not implemented in each ANTLR4 runtime) - private Deque lexerModeStack; - - private boolean wasSpaceIndentation; - private boolean wasTabIndentation; - private boolean wasIndentationMixedWithSpacesAndTabs; - - private Token curToken; // current (under processing) token - private Token ffgToken; // following (look ahead) token - - private final int INVALID_LENGTH = -1; - private final String ERR_TXT = " ERROR: "; - - protected PythonLexerBase(CharStream input) { - super(input); - this.init(); - } - - @Override - public Token nextToken() { // reading the input stream until a return EOF - this.checkNextToken(); - return this.pendingTokens.pollFirst(); // add the queued token to the token stream - } - - @Override - public void reset() { - this.init(); - super.reset(); - } - - private void init() { - 
this.indentLengthStack = new ArrayDeque<>(); - this.pendingTokens = new ArrayDeque<>(); - this.previousPendingTokenType = 0; - this.lastPendingTokenTypeFromDefaultChannel = 0; - this.opened = 0; - this.paren_or_bracket_openedStack = new ArrayDeque<>(); - this.braceExpressionStack = new ArrayDeque<>(); - this.prevBraceExpression = ""; - this.curLexerMode = 0; - this.lexerModeStack = new ArrayDeque<>(); - this.wasSpaceIndentation = false; - this.wasTabIndentation = false; - this.wasIndentationMixedWithSpacesAndTabs = false; - this.curToken = null; - this.ffgToken = null; - } - - private void checkNextToken() { - if (this.previousPendingTokenType == Token.EOF) - return; - - if (this.indentLengthStack.isEmpty()) { // We're at the first token - this.insertENCODINGtoken(); - this.setCurrentAndFollowingTokens(); - this.handleStartOfInput(); - } else { - this.setCurrentAndFollowingTokens(); - } - - switch (this.curToken.getType()) { - case PythonLexer.NEWLINE: - this.handleNEWLINEtoken(); - break; - case PythonLexer.LPAR: - case PythonLexer.LSQB: - case PythonLexer.LBRACE: - this.opened++; - this.addPendingToken(this.curToken); - break; - case PythonLexer.RPAR: - case PythonLexer.RSQB: - case PythonLexer.RBRACE: - this.opened--; - this.addPendingToken(this.curToken); - break; - case PythonLexer.FSTRING_MIDDLE: - this.handleFSTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field - this.addPendingToken(this.curToken); - break; - case PythonLexer.COLONEQUAL: - this.handleCOLONEQUALtokenInFString(); - break; - case PythonLexer.ERRORTOKEN: - this.reportLexerError("token recognition error at: '" + this.curToken.getText() + "'"); - this.addPendingToken(this.curToken); - break; - case Token.EOF: - this.handleEOFtoken(); - break; - default: - this.addPendingToken(this.curToken); - } - this.handleFORMAT_SPECIFICATION_MODE(); - } - - private void setCurrentAndFollowingTokens() { - this.curToken = this.ffgToken == null ? 
- super.nextToken() : - this.ffgToken; - - this.checkCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)! - - this.ffgToken = this.curToken.getType() == Token.EOF ? - this.curToken : - super.nextToken(); - } - - private void insertENCODINGtoken() { // https://peps.python.org/pep-0263/ - StringBuilder lineBuilder = new StringBuilder(); - String encodingName = ""; - int lineCount = 0; - final Pattern ws_commentPattern = Pattern.compile("^[ \\t\\f]*(#.*)?$"); - final CharStream charStream = this.getInputStream(); - final int size = charStream.size(); - - charStream.seek(0); - for (int i = 0; i < size; i++) { - char c = (char) charStream.LA(i + 1); - lineBuilder.append(c); - - if (c == '\n' || i == size - 1) { - String line = lineBuilder.toString().replace("\r", "").replace("\n", ""); - if (ws_commentPattern.matcher(line).find()) { // WS* + COMMENT? found - encodingName = getEncodingName(line); - if (!encodingName.isEmpty()) { - break; // encoding found - } - } else { - break; // statement or backslash found (line is not empty, not whitespace(s), not comment) - } - - lineCount++; - if (lineCount >= 2) { - break; // check only the first two lines - } - lineBuilder = new StringBuilder(); - } - } - - if (encodingName.isEmpty()) { - encodingName = "utf-8"; // default Python source code encoding - } - - final CommonToken encodingToken = new CommonToken(PythonLexer.ENCODING, encodingName); - encodingToken.setChannel(Token.HIDDEN_CHANNEL); - this.addPendingToken(encodingToken); - } - - private String getEncodingName(final String commentText) { // https://peps.python.org/pep-0263/#defining-the-encoding - final Pattern encodingCommentPattern = Pattern.compile("^[ \\t\\f]*#.*?coding[:=][ \\t]*([-_.a-zA-Z0-9]+)"); - final Matcher matcher = encodingCommentPattern.matcher(commentText); - return matcher.find() ? 
matcher.group(1) : ""; - } - - // initialize the indentLengthStack - // hide the leading NEWLINE token(s) - // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel - // insert a leading INDENT token if necessary - private void handleStartOfInput() { - // initialize the stack with a default 0 indentation length - this.indentLengthStack.push(0); // this will never be popped off - while (this.curToken.getType() != Token.EOF) { - if (this.curToken.getChannel() == Token.DEFAULT_CHANNEL) { - if (this.curToken.getType() == PythonLexer.NEWLINE) { - // all the NEWLINE tokens must be ignored before the first statement - this.hideAndAddPendingToken(this.curToken); - } else { // We're at the first statement - this.insertLeadingIndentToken(); - return; // continue the processing of the current token with checkNextToken() - } - } else { - this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token - } - this.setCurrentAndFollowingTokens(); - } - // continue the processing of the EOF token with checkNextToken() - } - - private void insertLeadingIndentToken() { - if (this.previousPendingTokenType == PythonLexer.WS) { - Token prevToken = this.pendingTokens.peekLast(); // WS token - if (this.getIndentationLength(prevToken.getText()) != 0) { // there is an "indentation" before the first statement - final String errMsg = "first statement indented"; - this.reportLexerError(errMsg); - // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser - this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken); - } - } - } - - private void handleNEWLINEtoken() { - if (!this.lexerModeStack.isEmpty()) { // for multi line fstring literals - this.addPendingToken(this.curToken); - } else if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token - 
this.hideAndAddPendingToken(this.curToken); - } else { - final Token nlToken = new CommonToken(this.curToken); // save the current NEWLINE token - final boolean isLookingAhead = this.ffgToken.getType() == PythonLexer.WS; - if (isLookingAhead) { - this.setCurrentAndFollowingTokens(); // set the next two tokens - } - - switch (this.ffgToken.getType()) { - case PythonLexer.NEWLINE: // We're before a blank line - case PythonLexer.COMMENT: // We're before a comment - this.hideAndAddPendingToken(nlToken); - if (isLookingAhead) { - this.addPendingToken(this.curToken); // WS token - } - break; - default: - this.addPendingToken(nlToken); - if (isLookingAhead) { // We're on a whitespace(s) followed by a statement - final int indentationLength = this.ffgToken.getType() == Token.EOF ? - 0 : - this.getIndentationLength(this.curToken.getText()); - - if (indentationLength != this.INVALID_LENGTH) { - this.addPendingToken(this.curToken); // WS token - this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) - } else { - this.reportError("inconsistent use of tabs and spaces in indentation"); - } - } else { // We're at a newline followed by a statement (there is no whitespace before the statement) - this.insertIndentOrDedentToken(0); // may insert DEDENT token(s) - } - } - } - } - - private void insertIndentOrDedentToken(final int indentLength) { - int prevIndentLength = this.indentLengthStack.peek(); - if (indentLength > prevIndentLength) { - this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); - this.indentLengthStack.push(indentLength); - } else { - while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream - this.indentLengthStack.pop(); - prevIndentLength = this.indentLengthStack.peek(); - if (indentLength <= prevIndentLength) { - this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); - } else { - 
this.reportError("inconsistent dedent"); - } - } - } - } - - private void checkCurToken() { - switch (this.curToken.getType()) { - case PythonLexer.FSTRING_START: - this.setLexerModeByFSTRING_STARTtoken(); - return; - case PythonLexer.FSTRING_MIDDLE: - this.handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field - if (this.curToken.getType() == PythonLexer.FSTRING_MIDDLE) - return; // No curToken exchange happened - break; - case PythonLexer.FSTRING_END: - this.popLexerMode(); - return; - default: - if (this.lexerModeStack.isEmpty()) - return; // Not in fstring mode - } - - switch (this.curToken.getType()) { // the following tokens can only come from default mode (after an LBRACE in fstring) - case PythonLexer.NEWLINE: - // append the current brace expression with the current newline - this.appendToBraceExpression(this.curToken.getText()); - final CommonToken ctkn = new CommonToken(this.curToken); - ctkn.setChannel(Token.HIDDEN_CHANNEL); - this.curToken = ctkn; - break; - case PythonLexer.LBRACE: - // the outermost brace expression cannot be a dictionary comprehension or a set comprehension - this.braceExpressionStack.push("{"); - this.paren_or_bracket_openedStack.push(0); - this.pushLexerMode(Lexer.DEFAULT_MODE); - break; - case PythonLexer.LPAR: - case PythonLexer.LSQB: - // append the current brace expression with a "(" or a "[" - this.appendToBraceExpression(this.curToken.getText()); - // https://peps.python.org/pep-0498/#lambdas-inside-expressions - this.incrementBraceStack(); - break; - case PythonLexer.RPAR: - case PythonLexer.RSQB: - // append the current brace expression with a ")" or a "]" - this.appendToBraceExpression(this.curToken.getText()); - this.decrementBraceStack(); - break; - case PythonLexer.COLON: - case PythonLexer.COLONEQUAL: - // append the current brace expression with a ":" or a ":=" - this.appendToBraceExpression(this.curToken.getText()); - this.setLexerModeByCOLONorCOLONEQUALtoken(); - break; - case PythonLexer.RBRACE: 
- this.setLexerModeAfterRBRACEtoken(); - break; - default: - // append the current brace expression with the current token text - this.appendToBraceExpression(this.curToken.getText()); - } - } - - private void appendToBraceExpression(String text) { - this.braceExpressionStack.push(this.braceExpressionStack.pop() + text); - } - - private void incrementBraceStack() { // increment the last element (peek() + 1) - this.paren_or_bracket_openedStack.push(this.paren_or_bracket_openedStack.pop() + 1); - } - - private void decrementBraceStack() { // decrement the last element (peek() - 1) - this.paren_or_bracket_openedStack.push(this.paren_or_bracket_openedStack.pop() - 1); - } - - private void setLexerModeAfterRBRACEtoken() { - switch (this.curLexerMode) { - case Lexer.DEFAULT_MODE: - this.popLexerMode(); - this.popByBRACE(); - break; - case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE: - case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE: - case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE: - this.popLexerMode(); - this.popLexerMode(); - this.popByBRACE(); - break; - default: - this.reportLexerError("f-string: single '}' is not allowed"); - } - } - - private void setLexerModeByFSTRING_STARTtoken() { - final String text = this.curToken.getText().toLowerCase(); - Map modeMap = new HashMap<>(); - modeMap.put("f'", PythonLexer.SQ1__FSTRING_MODE); - modeMap.put("rf'", PythonLexer.SQ1R_FSTRING_MODE); - modeMap.put("fr'", PythonLexer.SQ1R_FSTRING_MODE); - modeMap.put("f\"", PythonLexer.DQ1__FSTRING_MODE); - modeMap.put("rf\"", PythonLexer.DQ1R_FSTRING_MODE); - modeMap.put("fr\"", PythonLexer.DQ1R_FSTRING_MODE); - modeMap.put("f'''", PythonLexer.SQ3__FSTRING_MODE); - modeMap.put("rf'''", PythonLexer.SQ3R_FSTRING_MODE); - 
modeMap.put("fr'''", PythonLexer.SQ3R_FSTRING_MODE); - modeMap.put("f\"\"\"", PythonLexer.DQ3__FSTRING_MODE); - modeMap.put("rf\"\"\"", PythonLexer.DQ3R_FSTRING_MODE); - modeMap.put("fr\"\"\"", PythonLexer.DQ3R_FSTRING_MODE); - - Integer mode = modeMap.get(text); - if (mode != null) { - this.pushLexerMode(mode); - } - } - - private void setLexerModeByCOLONorCOLONEQUALtoken() { - if (this.paren_or_bracket_openedStack.peek() == 0) { - // COLONEQUAL token will be replaced with a COLON token in checkNextToken() - switch (this.lexerModeStack.peek()) { // check the previous lexer mode (the current is DEFAULT_MODE) - case PythonLexer.SQ1__FSTRING_MODE: - case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ1R_FSTRING_MODE: - case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ1__FSTRING_MODE: - case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ1R_FSTRING_MODE: - case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ3__FSTRING_MODE: - case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ3R_FSTRING_MODE: - case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ3__FSTRING_MODE: - case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. 
mode - break; - case PythonLexer.DQ3R_FSTRING_MODE: - case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - } - } - } - - private void popByBRACE() { - this.paren_or_bracket_openedStack.pop(); - this.prevBraceExpression = this.braceExpressionStack.pop() + "}"; - if (!this.braceExpressionStack.isEmpty()) { - // append the current brace expression with the previous brace expression - this.braceExpressionStack.push(this.braceExpressionStack.pop() + this.prevBraceExpression); - } - - } - - private void handleFSTRING_MIDDLEtokenWithDoubleBrace() { - // replace the trailing double brace with a single brace and insert a hidden brace token - switch (this.getLastTwoCharsOfTheCurTokenText()) { - case "{{": - this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL); - break; - case "}}": - this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL); - break; - } - } - - private void handleFSTRING_MIDDLEtokenWithQuoteAndLBrace() { - // replace the trailing quote + left_brace with a quote and insert an LBRACE token - // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token - switch (this.getLastTwoCharsOfTheCurTokenText()) { - case "\"{": - case "'{": - case "\\{": - this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL); - break; - } - } - - private String getLastTwoCharsOfTheCurTokenText() { - final String curTokenText = this.curToken.getText(); - return curTokenText.length() >= 2 ? 
curTokenText.substring(curTokenText.length() - 2) : curTokenText; - } - - private void trimLastCharAddPendingTokenSetCurToken(final int type, final String text, final int channel) { - // trim the last char and add the modified curToken to the pendingTokens stack - final String curTokenText = this.curToken.getText(); - final String tokenTextWithoutLastChar = curTokenText.substring(0, curTokenText.length() - 1); - final CommonToken ctkn = new CommonToken(this.curToken); - ctkn.setText(tokenTextWithoutLastChar); - ctkn.setStopIndex(ctkn.getStopIndex() - 1); - this.addPendingToken(ctkn); - - this.createNewCurToken(type, text, channel); // set curToken - } - - private void handleCOLONEQUALtokenInFString() { - if (!this.lexerModeStack.isEmpty() && - this.paren_or_bracket_openedStack.peek() == 0) { - - // In fstring a colonequal (walrus operator) can only be used in parentheses - // Not in parentheses, replace COLONEQUAL token with COLON as format specifier - // and insert the equal symbol to the following FSTRING_MIDDLE token - CommonToken ctkn = new CommonToken(this.curToken); - ctkn.setType(PythonLexer.COLON); - ctkn.setText(":"); - ctkn.setStopIndex(ctkn.getStartIndex()); - this.curToken = ctkn; - if (this.ffgToken.getType() == PythonLexer.FSTRING_MIDDLE) { - ctkn = new CommonToken(this.ffgToken); - ctkn.setText("=" + ctkn.getText()); - ctkn.setStartIndex(ctkn.getStartIndex() - 1); - ctkn.setCharPositionInLine(ctkn.getCharPositionInLine() - 1); - this.ffgToken = ctkn; - } else { - this.addPendingToken(this.curToken); - this.createNewCurToken(PythonLexer.FSTRING_MIDDLE, "=", Token.DEFAULT_CHANNEL); - } - } - this.addPendingToken(this.curToken); - } - - private void createNewCurToken(final int type, final String text, final int channel) { - final CommonToken ctkn = new CommonToken(this.curToken); - ctkn.setType(type); - ctkn.setText(text); - ctkn.setChannel(channel); - ctkn.setCharPositionInLine(ctkn.getCharPositionInLine() + 1); - 
ctkn.setStartIndex(ctkn.getStartIndex() + 1); - ctkn.setStopIndex(ctkn.getStartIndex()); - this.curToken = ctkn; - } - - private void pushLexerMode(final int mode) { - this.pushMode(mode); - this.lexerModeStack.push(this.curLexerMode); - this.curLexerMode = mode; - } - - private void popLexerMode() { - this.popMode(); - this.curLexerMode = this.lexerModeStack.pop(); - } - - private void handleFORMAT_SPECIFICATION_MODE() { - if (!this.lexerModeStack.isEmpty() && - this.ffgToken.getType() == PythonLexer.RBRACE) { - - // insert an empty FSTRING_MIDDLE token instead of the missing format specification - switch (this.curToken.getType()) { - case PythonLexer.COLON: - this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken); - break; - case PythonLexer.RBRACE: - // only if the previous brace expression is not a dictionary comprehension or set comprehension - if (!isDictionaryComprehensionOrSetComprehension(this.prevBraceExpression)) { - this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken); - } - break; - } - } - } - - private boolean isDictionaryComprehensionOrSetComprehension(final String code) { - final CharStream inputStream = CharStreams.fromString(code); - final PythonLexer lexer = new PythonLexer(inputStream); - final CommonTokenStream tokenStream = new CommonTokenStream(lexer); - PythonParser parser = new PythonParser(tokenStream); - - // Disable error listeners to suppress console output - lexer.removeErrorListeners(); - parser.removeErrorListeners(); - - parser.dictcomp(); // Try parsing as dictionary comprehension - if (parser.getNumberOfSyntaxErrors() == 0) - return true; - - parser = new PythonParser(tokenStream); - tokenStream.seek(0); - parser.removeErrorListeners(); - parser.setcomp(); // Try parsing as set comprehension - return parser.getNumberOfSyntaxErrors() == 0; - } - - private void insertTrailingTokens() { - switch (this.lastPendingTokenTypeFromDefaultChannel) { 
- case PythonLexer.NEWLINE: - case PythonLexer.DEDENT: - break; // no trailing NEWLINE token is needed - default: // insert an extra trailing NEWLINE token that serves as the end of the last statement - this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // ffgToken is EOF - } - this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed - } - - private void handleEOFtoken() { - if (this.lastPendingTokenTypeFromDefaultChannel > 0) { - // there was a statement in the input (leading NEWLINE tokens are hidden) - this.insertTrailingTokens(); - } - this.addPendingToken(this.curToken); - } - - private void hideAndAddPendingToken(final Token tkn) { - final CommonToken ctkn = new CommonToken(tkn); - ctkn.setChannel(Token.HIDDEN_CHANNEL); - this.addPendingToken(ctkn); - } - - private void createAndAddPendingToken(final int ttype, final int channel, final String text, final Token sampleToken) { - final CommonToken ctkn = new CommonToken(sampleToken); - ctkn.setType(ttype); - ctkn.setChannel(channel); - ctkn.setStopIndex(sampleToken.getStartIndex() - 1); - ctkn.setText(text == null ? 
- "<" + this.getVocabulary().getDisplayName(ttype) + ">" : - text); - - this.addPendingToken(ctkn); - } - - private void addPendingToken(final Token tkn) { - // save the last pending token type because the pendingTokens list can be empty by the nextToken() - this.previousPendingTokenType = tkn.getType(); - if (tkn.getChannel() == Token.DEFAULT_CHANNEL) { - this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; - } - this.pendingTokens.addLast(tkn); - } - - private int getIndentationLength(final String indentText) { // the indentText may contain spaces, tabs or form feeds - final int TAB_LENGTH = 8; // the standard number of spaces to replace a tab with spaces - int length = 0; - for (char ch : indentText.toCharArray()) { - switch (ch) { - case ' ': - this.wasSpaceIndentation = true; - length += 1; - break; - case '\t': - this.wasTabIndentation = true; - length += TAB_LENGTH - (length % TAB_LENGTH); - break; - case '\f': // form feed - length = 0; - break; - } - } - - if (this.wasTabIndentation && this.wasSpaceIndentation) { - if (!(this.wasIndentationMixedWithSpacesAndTabs)) { - this.wasIndentationMixedWithSpacesAndTabs = true; - length = this.INVALID_LENGTH; // only for the first inconsistent indent - } - } - return length; - } - - private void reportLexerError(final String errMsg) { - this.getErrorListenerDispatch().syntaxError(this, this.curToken, this.curToken.getLine(), this.curToken.getCharPositionInLine(), " LEXER" + this.ERR_TXT + errMsg, null); - } - - private void reportError(final String errMsg) { - this.reportLexerError(errMsg); - - // the ERRORTOKEN will raise an error in the parser - this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken); - } -} diff --git a/python/python3_13/JavaScript/PythonLexerBase.js b/python/python3_13/JavaScript/PythonLexerBase.js deleted file mode 100644 index 5c08004f40..0000000000 --- a/python/python3_13/JavaScript/PythonLexerBase.js +++ 
/dev/null @@ -1,676 +0,0 @@ -/* -The MIT License (MIT) -Copyright (c) 2021 Robert Einhorn - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
- */ - -/* - * - * Project : Python Indent/Dedent handler for ANTLR4 grammars - * - * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com - * - */ - -import { CharStreams, CommonTokenStream, Token, CommonToken, Lexer } from "antlr4"; -import PythonLexer from "./PythonLexer.js"; -import PythonParser from "./PythonParser.js"; - -export default class PythonLexerBase extends Lexer { - constructor(input) { - super(input); - - // A stack that keeps track of the indentation lengths - this.indentLengthStack; - // A list where tokens are waiting to be loaded into the token stream - this.pendingTokens; - - // last pending token types - this.previousPendingTokenType; - this.lastPendingTokenTypeFromDefaultChannel; - - // The amount of opened parentheses, square brackets or curly braces - this.opened; - // The amount of opened parentheses and square brackets in the current lexer mode - this.paren_or_bracket_openedStack; - // A stack that stores expression(s) between braces in fstring - this.braceExpressionStack; - this.prevBraceExpression; - - // Instead of this._mode (_mode is not implemented in each ANTLR4 runtime) - this.curLexerMode; - // Instead of this._modeStack (_modeStack is not implemented in each ANTLR4 runtime) - this.lexerModeStack; - - this.wasSpaceIndentation; - this.wasTabIndentation; - this.wasIndentationMixedWithSpacesAndTabs; - - this.curToken; // current (under processing) token - this.ffgToken; // following (look ahead) token - - this.#init(); - } - - get #INVALID_LENGTH() { return -1; } - get #ERR_TXT() { return " ERROR: "; } - - nextToken() { // reading the input stream until a return EOF - this.#checkNextToken(); - return this.pendingTokens.shift() /* stack pollFirst() */; // add the queued token to the token stream - } - - reset() { - this.#init(); - super.reset(); - } - - #init() { - this.indentLengthStack = []; - this.pendingTokens = []; - this.previousPendingTokenType = 0; - this.lastPendingTokenTypeFromDefaultChannel = 0; - this.opened = 0; - 
this.paren_or_bracket_openedStack = []; - this.braceExpressionStack = []; - this.prevBraceExpression = ""; - this.curLexerMode = 0; - this.lexerModeStack = []; - this.wasSpaceIndentation = false; - this.wasTabIndentation = false; - this.wasIndentationMixedWithSpacesAndTabs = false; - this.curToken = null; - this.ffgToken = null; - } - - #checkNextToken() { - if (this.previousPendingTokenType === Token.EOF) - return; - - if (this.indentLengthStack.length === 0) { // We're at the first token - this.#insertENCODINGtoken(); - this.#setCurrentAndFollowingTokens(); - this.#handleStartOfInput(); - } else { - this.#setCurrentAndFollowingTokens(); - } - - switch (this.curToken.type) { - case PythonLexer.NEWLINE: - this.#handleNEWLINEtoken(); - break; - case PythonLexer.LPAR: - case PythonLexer.LSQB: - case PythonLexer.LBRACE: - this.opened++; - this.#addPendingToken(this.curToken); - break; - case PythonLexer.RPAR: - case PythonLexer.RSQB: - case PythonLexer.RBRACE: - this.opened--; - this.#addPendingToken(this.curToken); - break; - case PythonLexer.FSTRING_MIDDLE: - this.#handleFSTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field - this.#addPendingToken(this.curToken); - break; - case PythonLexer.COLONEQUAL: - this.#handleCOLONEQUALtokenInFString(); - break; - case PythonLexer.ERRORTOKEN: - this.#reportLexerError(`token recognition error at: '${this.curToken.text}'`); - this.#addPendingToken(this.curToken); - break; - case Token.EOF: - this.#handleEOFtoken(); - break; - default: - this.#addPendingToken(this.curToken); - } - this.#handleFORMAT_SPECIFICATION_MODE(); - } - - #setCurrentAndFollowingTokens() { - this.curToken = this.ffgToken == undefined ? - super.nextToken() : - this.ffgToken; - - this.#checkCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)! - - this.ffgToken = this.curToken.type === Token.EOF ? 
- this.curToken : - super.nextToken(); - } - - #insertENCODINGtoken() { - let lineBuilder = []; - let encodingName = ""; - let lineCount = 0; - const ws_commentPattern = /^[ \t\f]*(#.*)?$/; - const inputStream = this.inputStream; - const size = inputStream.size; - - inputStream.seek(0); - for (let i = 0; i < size; i++) { - let c = String.fromCharCode(inputStream.LA(i + 1)); - lineBuilder.push(c); - - if (c == '\n' || i == size - 1) { - let line = lineBuilder.join("").replace("\r", "").replace("\n", ""); - if (ws_commentPattern.test(line)) { // WS* + COMMENT? found - encodingName = this.#getEncodingName(line); - if (encodingName !== "") { - break; // encoding found - } - } else { - break; // statement or backslash found (line is not empty, not whitespace, not comment) - } - - lineCount++; - if (lineCount >= 2) { - break; // check only the first two lines - } - lineBuilder = []; - } - } - - if (encodingName === "") { - encodingName = "utf-8"; // default Python source code encoding - } - - const encodingToken = new CommonToken([null, null], PythonLexer.ENCODING, Token.HIDDEN_CHANNEL, 0, 0); - encodingToken.text = encodingName; - encodingToken.line = 0; - encodingToken.column = -1; - this.#addPendingToken(encodingToken); - } - - #getEncodingName(commentText) { // https://peps.python.org/pep-0263/#defining-the-encoding - const encodingCommentPattern = /^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)/; - const match = commentText.match(encodingCommentPattern); - return match ? 
match[1] : ""; - } - - // initialize the _indentLengthStack - // hide the leading NEWLINE token(s) - // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel - // insert a leading INDENT token if necessary - #handleStartOfInput() { - // initialize the stack with a default 0 indentation length - this.indentLengthStack.push(0); // this will never be popped off - while (this.curToken.type !== Token.EOF) { - if (this.curToken.channel === Token.DEFAULT_CHANNEL) { - if (this.curToken.type === PythonLexer.NEWLINE) { - // all the NEWLINE tokens must be ignored before the first statement - this.#hideAndAddPendingToken(this.curToken); - } else { // We're at the first statement - this.#insertLeadingIndentToken(); - return; // continue the processing of the current token with #checkNextToken() - } - } else { - this.#addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token - } - this.#setCurrentAndFollowingTokens(); - } // continue the processing of the EOF token with #checkNextToken() - } - - #insertLeadingIndentToken() { - if (this.previousPendingTokenType === PythonLexer.WS) { - const prevToken = this.pendingTokens.at(- 1); /* stack peekLast() */ // WS token - if (this.#getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement - const errMsg = "first statement indented"; - this.#reportLexerError(errMsg); - // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser - this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.curToken); - } - } - } - - #handleNEWLINEtoken() { - if (this.lexerModeStack.length > 0) { - this.#addPendingToken(this.curToken); - } else if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token - this.#hideAndAddPendingToken(this.curToken); - } else { - const nlToken = this.curToken.clone(); // save 
the current NEWLINE token - const isLookingAhead = this.ffgToken.type === PythonLexer.WS; - if (isLookingAhead) { - this.#setCurrentAndFollowingTokens(); // set the next two tokens - } - - switch (this.ffgToken.type) { - case PythonLexer.NEWLINE: // We're before a blank line - case PythonLexer.COMMENT: // We're before a comment - this.#hideAndAddPendingToken(nlToken); - if (isLookingAhead) { - this.#addPendingToken(this.curToken); // WS token - } - break; - default: - this.#addPendingToken(nlToken); - if (isLookingAhead) { // We're on a whitespace(s) followed by a statement - const indentationLength = this.ffgToken.type === Token.EOF ? - 0 : - this.#getIndentationLength(this.curToken.text); - - if (indentationLength !== this.#INVALID_LENGTH) { - this.#addPendingToken(this.curToken); // WS token - this.#insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) - } else { - this.#reportError("inconsistent use of tabs and spaces in indentation"); - } - } else { // We're at a newline followed by a statement (there is no whitespace before the statement) - this.#insertIndentOrDedentToken(0); // may insert DEDENT token(s) - } - } - } - } - - #insertIndentOrDedentToken(curIndentLength) { - let prevIndentLength = this.indentLengthStack.at(-1) /* peek() */; - if (curIndentLength > prevIndentLength) { - this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); - this.indentLengthStack.push(curIndentLength); - } else { - while (curIndentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream - this.indentLengthStack.pop(); - prevIndentLength = this.indentLengthStack.at(-1) /* peek() */; - if (curIndentLength <= prevIndentLength) { - this.#createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken); - } else { - this.#reportError("inconsistent dedent"); - } - } - } - } - - #checkCurToken() { - switch (this.curToken.type) { - case 
PythonLexer.FSTRING_START: - this.#setLexerModeByFSTRING_STARTtoken(); - return; - case PythonLexer.FSTRING_MIDDLE: - this.#handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field - if (this.curToken.type === PythonLexer.FSTRING_MIDDLE) { - return; // No curToken exchange happened - } - break; - case PythonLexer.FSTRING_END: - this.#popLexerMode(); - return; - default: - if (this.lexerModeStack.length === 0) { - return; // Not in fstring mode - } - } - - switch (this.curToken.type) { // the following tokens can only come from default mode (after an LBRACE in fstring) - case PythonLexer.NEWLINE: - // append the current brace expression with the current newline - this.#appendToBraceExpression(this.curToken.text) - this.curToken.channel = Token.HIDDEN_CHANNEL; - break; - case PythonLexer.LBRACE: - // the outermost brace expression cannot be a dictionary comprehension or a set comprehension - this.braceExpressionStack.push("{"); - this.paren_or_bracket_openedStack.push(0); - this.#pushLexerMode(Lexer.DEFAULT_MODE); - break; - case PythonLexer.LPAR: - case PythonLexer.LSQB: - // append the current brace expression with a "(" or a "[" - this.#appendToBraceExpression(this.curToken.text) - // https://peps.python.org/pep-0498/#lambdas-inside-expressions - this.#incrementBraceStack(); - break; - case PythonLexer.RPAR: - case PythonLexer.RSQB: - // append the current brace expression with a ")" or a "]" - this.#appendToBraceExpression(this.curToken.text) - this.#decrementBraceStack(); - break; - case PythonLexer.COLON: - case PythonLexer.COLONEQUAL: - // append the current brace expression with a ":" or a ":=" - this.#appendToBraceExpression(this.curToken.text) - this.#setLexerModeByCOLONorCOLONEQUALtoken(); - break; - case PythonLexer.RBRACE: - this.#setLexerModeAfterRBRACEtoken(); - break; - default: - // append the current brace expression with the current token text - this.#appendToBraceExpression(this.curToken.text) - } - } - - 
#appendToBraceExpression(text) { - this.braceExpressionStack[this.braceExpressionStack.length - 1] += text; - } - - #incrementBraceStack() { // increment the last element (peek() + 1) - this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1]++; - } - - #decrementBraceStack() { // decrement the last element (peek() - 1) - this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1]--; - } - - #setLexerModeAfterRBRACEtoken() { - switch (this.curLexerMode) { - case Lexer.DEFAULT_MODE: - this.#popLexerMode(); - this.#popByBRACE(); - break; - case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE: - case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE: - case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE: - this.#popLexerMode(); - this.#popLexerMode(); - this.#popByBRACE(); - break; - default: - this.#reportLexerError("f-string: single '}' is not allowed"); - } - } - - #setLexerModeByFSTRING_STARTtoken() { - const text = this.curToken.text.toLowerCase(); - const modeMap = { - "f'": PythonLexer.SQ1__FSTRING_MODE, - "rf'": PythonLexer.SQ1R_FSTRING_MODE, - "fr'": PythonLexer.SQ1R_FSTRING_MODE, - 'f"': PythonLexer.DQ1__FSTRING_MODE, - 'rf"': PythonLexer.DQ1R_FSTRING_MODE, - 'fr"': PythonLexer.DQ1R_FSTRING_MODE, - "f'''": PythonLexer.SQ3__FSTRING_MODE, - "rf'''": PythonLexer.SQ3R_FSTRING_MODE, - "fr'''": PythonLexer.SQ3R_FSTRING_MODE, - 'f"""': PythonLexer.DQ3__FSTRING_MODE, - 'rf"""': PythonLexer.DQ3R_FSTRING_MODE, - 'fr"""': PythonLexer.DQ3R_FSTRING_MODE, - }; - const mode = modeMap[text]; - if (mode !== undefined) { - this.#pushLexerMode(mode); - } - } - - #setLexerModeByCOLONorCOLONEQUALtoken() { - if (this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1] === 0) { // 
stack peek == 0 - const previousMode = this.lexerModeStack[this.lexerModeStack.length - 1]; // stack peek - switch (previousMode) { // check the previous lexer mode (the current is DEFAULT_MODE) - case PythonLexer.SQ1__FSTRING_MODE: - case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE: - this.#pushLexerMode(PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ1R_FSTRING_MODE: - case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE: - this.#pushLexerMode(PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ1__FSTRING_MODE: - case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE: - this.#pushLexerMode(PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ1R_FSTRING_MODE: - case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE: - this.#pushLexerMode(PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ3__FSTRING_MODE: - case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE: - this.#pushLexerMode(PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ3R_FSTRING_MODE: - case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE: - this.#pushLexerMode(PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ3__FSTRING_MODE: - case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE: - this.#pushLexerMode(PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ3R_FSTRING_MODE: - case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE: - this.#pushLexerMode(PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. 
mode - break; - } - } - } - - #popByBRACE() { - this.paren_or_bracket_openedStack.pop(); - this.prevBraceExpression = this.braceExpressionStack.pop() + "}"; - if (this.braceExpressionStack.length > 0) { - // append the current brace expression with the previous brace expression - this.braceExpressionStack[this.braceExpressionStack.length - 1] += this.prevBraceExpression; - } - } - - #handleFSTRING_MIDDLEtokenWithDoubleBrace() { - // replace the trailing double brace with a single brace and insert a hidden brace token - switch (this.#getLastTwoCharsOfTheCurTokenText()) { - case "{{": - this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL); - break; - case "}}": - this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL); - break; - } - } - - #handleFSTRING_MIDDLEtokenWithQuoteAndLBrace() { - // replace the trailing quote + left_brace with a quote and insert an LBRACE token - // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token - switch (this.#getLastTwoCharsOfTheCurTokenText()) { - case "\"{": - case "'{": - case "\\{": - this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL); - break; - } - } - - #getLastTwoCharsOfTheCurTokenText() { - return this.curToken.text.slice(-2); - } - - #trimLastCharAddPendingTokenSetCurToken(type, text, channel) { - // trim the last char and add the modified curToken to the pendingTokens stack - const tokenTextWithoutLastChar = this.curToken.text.slice(0, -1); - this.curToken.text = tokenTextWithoutLastChar; - this.curToken.stop -= 1; - this.#addPendingToken(this.curToken); - - this.#createNewCurToken(type, text, channel); // set curToken - } - - #handleCOLONEQUALtokenInFString() { - if (this.lexerModeStack.length > 0 && - this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1] === 0) { // stack peek == 0 - - // In fstring a colonequal (walrus operator) can only be 
used in parentheses - // Not in parentheses, replace COLONEQUAL token with COLON as format specifier - // and insert the equal symbol to the following FSTRING_MIDDLE token - this.curToken.type = PythonLexer.COLON; - this.curToken.text = ":"; - this.curToken.stop = this.curToken.start; - - if (this.ffgToken.type === PythonLexer.FSTRING_MIDDLE) { - this.ffgToken.text = "=" + this.ffgToken.text; - this.ffgToken.start -= 1; - this.ffgToken.column -= 1; - } else { - this.#addPendingToken(this.curToken); - this.#createNewCurToken(PythonLexer.FSTRING_MIDDLE, "=", Token.DEFAULT_CHANNEL); - } - } - this.#addPendingToken(this.curToken); - } - - #createNewCurToken(type, text, channel) { - const ctkn = this.curToken.clone(); - ctkn.type = type; - ctkn.text = text; - ctkn.channel = channel; - ctkn.column += 1; - ctkn.start += 1; - ctkn.stop = ctkn.start; - this.curToken = ctkn; - } - - #pushLexerMode(mode) { - this.pushMode(mode); - this.lexerModeStack.push(this.curLexerMode); - this.curLexerMode = mode; - } - - #popLexerMode() { - this.popMode(); - this.curLexerMode = this.lexerModeStack.pop(); - } - - #handleFORMAT_SPECIFICATION_MODE() { - if (this.lexerModeStack.length > 0 && - this.ffgToken.type === PythonLexer.RBRACE) { - - // insert an empty FSTRING_MIDDLE token instead of the missing format specification - switch (this.curToken.type) { - case PythonLexer.COLON: - this.#createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken); - break; - case PythonLexer.RBRACE: - // only if the previous brace expression is not a dictionary comprehension or set comprehension - if (!this.#isDictionaryComprehensionOrSetComprehension(this.prevBraceExpression)) { - this.#createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken); - } - break; - } - } - } - - #isDictionaryComprehensionOrSetComprehension(code) { - const inputStream = CharStreams.fromString(code); - const lexer = new PythonLexer(inputStream); - const 
tokenStream = new CommonTokenStream(lexer); - let parser = new PythonParser(tokenStream); - - // Disable error listeners to suppress console output - lexer.removeErrorListeners(); - parser.removeErrorListeners(); - - parser.dictcomp(); // Try parsing as dictionary comprehension - if (parser.syntaxErrorsCount === 0) - return true; - - parser = new PythonParser(tokenStream); - tokenStream.seek(0); - parser.removeErrorListeners(); - parser.setcomp(); // Try parsing as set comprehension - return parser.syntaxErrorsCount === 0; - } - - #insertTrailingTokens() { - switch (this.lastPendingTokenTypeFromDefaultChannel) { - case PythonLexer.NEWLINE: - case PythonLexer.DEDENT: - break; // no trailing NEWLINE token is needed - default: - // insert an extra trailing NEWLINE token that serves as the end of the last statement - this.#createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // _ffgToken is EOF - } - this.#insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed - } - - #handleEOFtoken() { - if (this.lastPendingTokenTypeFromDefaultChannel > 0) { - // there was a statement in the input (leading NEWLINE tokens are hidden) - this.#insertTrailingTokens(); - } - this.#addPendingToken(this.curToken); - } - - #hideAndAddPendingToken(ctkn) { - ctkn.channel = Token.HIDDEN_CHANNEL; - this.#addPendingToken(ctkn); - } - - #createAndAddPendingToken(type, channel, text, sampleToken) { - const ctkn = sampleToken.clone(); - ctkn.type = type; - ctkn.channel = channel; - ctkn.stop = sampleToken.start - 1; - ctkn.text = text == null ? 
- `<${this.getSymbolicNames()[type]}>` : - text; - - this.#addPendingToken(ctkn); - } - - #addPendingToken(tkn) { - // save the last pending token type because the _pendingTokens linked list can be empty by the nextToken() - this.previousPendingTokenType = tkn.type; - if (tkn.channel === Token.DEFAULT_CHANNEL) { - this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; - } - this.pendingTokens.push(tkn) /* .addLast(token) */; - } - - #getIndentationLength(indentText) { // the indentText may contain spaces, tabs or form feeds - const TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces - let length = 0; - for (let ch of indentText) { - switch (ch) { - case " ": - this.wasSpaceIndentation = true; - length += 1; - break; - case "\t": - this.wasTabIndentation = true; - length += TAB_LENGTH - (length % TAB_LENGTH); - break; - case "\f": // form feed - length = 0; - break; - } - } - - if (this.wasTabIndentation && this.wasSpaceIndentation) { - if (!this.wasIndentationMixedWithSpacesAndTabs) { - this.wasIndentationMixedWithSpacesAndTabs = true; - length = this.#INVALID_LENGTH; // only for the first inconsistent indent - } - } - return length; - } - - #reportLexerError(errMsg) { - this.getErrorListener().syntaxError(this, this.curToken, this.curToken.line, this.curToken.column, " LEXER" + this.#ERR_TXT + errMsg, null); - } - - #reportError(errMsg) { - this.#reportLexerError(errMsg); - - // the ERRORTOKEN will raise an error in the parser - this.#createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.ffgToken); - } -} diff --git a/python/python3_13/Python3/PythonLexerBase.py b/python/python3_13/Python3/PythonLexerBase.py deleted file mode 100644 index d3272163a9..0000000000 --- a/python/python3_13/Python3/PythonLexerBase.py +++ /dev/null @@ -1,557 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2021 Robert Einhorn -# -# Permission is hereby granted, free of charge, to any person 
obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
- -# Project : Python Indent/Dedent handler for ANTLR4 grammars -# -# Developed by : Robert Einhorn - -from typing import TextIO, Optional, List, Deque -from antlr4 import InputStream, Lexer, Token -from antlr4.Token import CommonToken -import sys -import re - -class PythonLexerBase(Lexer): - def __init__(self, input: InputStream, output: TextIO = sys.stdout): - super().__init__(input, output) - - # A stack that keeps track of the indentation lengths - self.__indent_length_stack: List[int] - - # A list where tokens are waiting to be loaded into the token stream - self.__pending_tokens: Deque[CommonToken] - - # last pending token type - self.__previous_pending_token_type: int - self.__last_pending_token_type_from_default_channel: int - - # The amount of opened parentheses, square brackets or curly braces - self.__opened: int - # The amount of opened parentheses and square brackets in the current lexer mode - self.__paren_or_bracket_opened_stack: List[int] - # A stack that stores expression(s) between braces in fstring - self.__brace_expression_stack: List[str] - self.__prev_brace_expression: str - - # Instead of self._mode (self._mode is not implemented in each ANTLR4 runtime) - self.__cur_lexer_mode: int - # Instead of self._modeStack (self._modeStack is not implemented in each ANTLR4 runtime) - self.__lexer_mode_stack: List[int] - - self.__was_space_indentation: bool - self.__was_tab_indentation: bool - self.__was_indentation_mixed_with_spaces_and_tabs: bool - - self.__cur_token: CommonToken # current (under processing) token - self.__ffg_token: CommonToken # following (look ahead) token - - self.__INVALID_LENGTH: int = -1 - self.__ERR_TXT: str = " ERROR: " - - self.__init() - - def nextToken(self) -> CommonToken: # reading the input stream until a return EOF - self.__check_next_token() - return self.__pending_tokens.popleft() # add the queued token to the token stream - - def reset(self) -> None: - self.__init() - super().reset() - - def __init(self) -> None: - 
self.__indent_length_stack = [] - self.__pending_tokens = Deque() - self.__previous_pending_token_type = 0 - self.__last_pending_token_type_from_default_channel = 0 - self.__opened = 0 - self.__paren_or_bracket_opened_stack = [] - self.__brace_expression_stack = [] - self.__prev_brace_expression = "" - self.__cur_lexer_mode = 0 - self.__lexer_mode_stack = [] - self.__was_space_indentation = False - self.__was_tab_indentation = False - self.__was_indentation_mixed_with_spaces_and_tabs = False - self.__cur_token = None - self.__ffg_token = None - - def __check_next_token(self) -> None: - if self.__previous_pending_token_type == Token.EOF: - return - - if not self.__indent_length_stack: # We're at the first token - self.__insert_ENCODING_token() - self.__set_current_and_following_tokens() - self.__handle_start_of_input() - else: - self.__set_current_and_following_tokens() - - match self.__cur_token.type: - case self.NEWLINE: - self.__handle_NEWLINE_token() - case self.LPAR | self.LSQB | self.LBRACE: - self.__opened += 1 - self.__add_pending_token(self.__cur_token) - case self.RPAR | self.RSQB | self.RBRACE: - self.__opened -= 1 - self.__add_pending_token(self.__cur_token) - case self.FSTRING_MIDDLE: - self.__handle_FSTRING_MIDDLE_token_with_double_brace() # does not affect the opened field - self.__add_pending_token(self.__cur_token) - case self.COLONEQUAL: - self.__handle_COLONEQUAL_token_in_fstring() - case self.ERRORTOKEN: - self.__report_lexer_error("token recognition error at: '" + self.__cur_token.text + "'") - self.__add_pending_token(self.__cur_token) - case Token.EOF: - self.__handle_EOF_token() - case _: - self.__add_pending_token(self.__cur_token) - self.__handle_FORMAT_SPECIFICATION_MODE() - - def __set_current_and_following_tokens(self) -> None: - self.__cur_token = super().nextToken() if self.__ffg_token is None else \ - self.__ffg_token - - self.__check_cur_token() # ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)! 
- - self.__ffg_token = self.__cur_token if self.__cur_token.type == Token.EOF else \ - super().nextToken() - - def __insert_ENCODING_token(self) -> None: # https://peps.python.org/pep-0263/ - line_builder: list[str] = [] - encoding_name: str = "" - line_count: int = 0 - ws_comment_pattern: re.Pattern = re.compile(r"^[ \t\f]*(#.*)?$") - input_stream: InputStream = self.inputStream - size: int = input_stream.size - - input_stream.seek(0) - for i in range(size): - c: str = chr(input_stream.LA(i + 1)) - line_builder.append(c) - - if c == '\n' or i == size - 1: - line: str = ''.join(line_builder).replace("\r", "").replace("\n", "") - if ws_comment_pattern.match(line): # WS* + COMMENT? found - encoding_name = self.__get_encoding_name(line) - if encoding_name: - break # encoding found - else: - break # statement or backslash found (first line is not empty, not whitespace(s), not comment) - - line_count += 1 - if line_count >= 2: - break # check only the first two lines - line_builder = [] - - if not encoding_name: - encoding_name = "utf-8" # default Python source code encoding - - encoding_token: CommonToken = CommonToken((None, None), self.ENCODING, CommonToken.HIDDEN_CHANNEL, 0, 0) - encoding_token.text = encoding_name - encoding_token.line = 0 - encoding_token.column = -1 - self.__add_pending_token(encoding_token) - - def __get_encoding_name(self, comment_text: str) -> str: # https://peps.python.org/pep-0263/#defining-the-encoding - encoding_comment_pattern: str = r"^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)" - match: Optional[re.Match] = re.search(encoding_comment_pattern, comment_text) - return match.group(1) if match else "" - - # initialize the _indent_length_stack - # hide the leading NEWLINE token(s) - # if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel - # insert a leading INDENT token if necessary - def __handle_start_of_input(self) -> None: - # initialize the stack with a default 0 indentation length - 
self.__indent_length_stack.append(0) # this will never be popped off - while self.__cur_token.type != Token.EOF: - if self.__cur_token.channel == Token.DEFAULT_CHANNEL: - if self.__cur_token.type == self.NEWLINE: - # all the NEWLINE tokens must be ignored before the first statement - self.__hide_and_add_pending_token(self.__cur_token) - else: # We're at the first statement - self.__insert_leading_indent_token() - return # continue the processing of the current token with __check_next_token() - else: - self.__add_pending_token(self.__cur_token) # it can be WS, EXPLICIT_LINE_JOINING or COMMENT token - self.__set_current_and_following_tokens() - # continue the processing of the EOF token with __check_next_token() - - def __insert_leading_indent_token(self) -> None: - if self.__previous_pending_token_type == self.WS: - prev_token: CommonToken = self.__pending_tokens[-1] # WS token - if self.__get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement - err_msg: str = "first statement indented" - self.__report_lexer_error(err_msg) - # insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser - self.__create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__cur_token) - - def __handle_NEWLINE_token(self) -> None: - if self.__lexer_mode_stack: # not is_empty - self.__add_pending_token(self.__cur_token) - elif self.__opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token - self.__hide_and_add_pending_token(self.__cur_token) - else: - nl_token: CommonToken = self.__cur_token.clone() # save the current NEWLINE token - is_looking_ahead: bool = self.__ffg_token.type == self.WS - if is_looking_ahead: - self.__set_current_and_following_tokens() # set the next two tokens - - match self.__ffg_token.type: - case self.NEWLINE | self.COMMENT: - # We're before a blank line or a comment or type comment or a type ignore comment - 
self.__hide_and_add_pending_token(nl_token) # ignore the NEWLINE token - if is_looking_ahead: - self.__add_pending_token(self.__cur_token) # WS token - case _: - self.__add_pending_token(nl_token) - if is_looking_ahead: # We're on a whitespace(s) followed by a statement - indentation_length: int = 0 if self.__ffg_token.type == Token.EOF else \ - self.__get_indentation_length(self.__cur_token.text) - - if indentation_length != self.__INVALID_LENGTH: - self.__add_pending_token(self.__cur_token) # WS token - self.__insert_INDENT_or_DEDENT_token(indentation_length) # may insert INDENT token or DEDENT token(s) - else: - self.__report_error("inconsistent use of tabs and spaces in indentation") - else: # We're at a newline followed by a statement (there is no whitespace before the statement) - self.__insert_INDENT_or_DEDENT_token(0) # may insert DEDENT token(s) - - def __insert_INDENT_or_DEDENT_token(self, indent_length: int) -> None: - prev_indent_length: int = self.__indent_length_stack[-1] # stack peek - if indent_length > prev_indent_length: - self.__create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, None, self.__ffg_token) - self.__indent_length_stack.append(indent_length) # stack push - else: - while indent_length < prev_indent_length: # more than 1 DEDENT token may be inserted to the token stream - self.__indent_length_stack.pop() - prev_indent_length = self.__indent_length_stack[-1] # stack peek - if indent_length <= prev_indent_length: - self.__create_and_add_pending_token(self.DEDENT, Token.DEFAULT_CHANNEL, None, self.__ffg_token) - else: - self.__report_error("inconsistent dedent") - - def __check_cur_token(self) -> None: - match self.__cur_token.type: - case self.FSTRING_START: - self.__set_lexer_mode_by_FSTRING_START_token() - return - case self.FSTRING_MIDDLE: - self.__handle_FSTRING_MIDDLE_token_with_quote_and_lbrace() # affect the opened field - if self.__cur_token.type == self.FSTRING_MIDDLE: - return # No __cur_token exchange happened - 
case self.FSTRING_END: - self.__pop_lexer_mode() - return - case _: - if not self.__lexer_mode_stack: - return # Not in fstring mode - - match self.__cur_token.type: # the following tokens can only come from default mode (after an LBRACE in fstring) - case self.NEWLINE: - # append the current brace expression with the current newline - self.__append_to_brace_expression(self.__cur_token.text) - self.__cur_token.channel = Token.HIDDEN_CHANNEL - case self.LBRACE: - # the outermost brace expression cannot be a dictionary comprehension or a set comprehension - self.__brace_expression_stack.append("{") - self.__paren_or_bracket_opened_stack.append(0) # stack push - self.__push_lexer_mode(Lexer.DEFAULT_MODE) - case self.LPAR | self.LSQB: - # append the current brace expression with a "(" or a "[" - self.__append_to_brace_expression(self.__cur_token.text) - # https://peps.python.org/pep-0498/#lambdas-inside-expressions - self.__increment_brace_stack() - case self.RPAR | self.RSQB: - # append the current brace expression with a ")" or a "]" - self.__append_to_brace_expression(self.__cur_token.text) - self.__decrement_brace_stack() - case self.COLON | self.COLONEQUAL: - # append the current brace expression with a ":" or a ":=" - self.__append_to_brace_expression(self.__cur_token.text) - self.__set_lexer_mode_by_COLON_or_COLONEQUAL_token() - case self.RBRACE: - self.__set_lexer_mode_after_RBRACE_token() - case _: - # append the current brace expression with the current token text - self.__append_to_brace_expression(self.__cur_token.text) - - def __append_to_brace_expression(self, text: str) -> None: - self.__brace_expression_stack[-1] += text - - def __increment_brace_stack(self) -> None: # increment the last element (peek() + 1) - self.__paren_or_bracket_opened_stack[-1] += 1 - - def __decrement_brace_stack(self) -> None: # decrement the last element (peek() - 1) - self.__paren_or_bracket_opened_stack[-1] -= 1 - - def __set_lexer_mode_after_RBRACE_token(self) -> None: - 
match self.__cur_lexer_mode: - case Lexer.DEFAULT_MODE: - self.__pop_lexer_mode() # only once - self.__pop_by_RBRACE() - - case self.SQ1__FORMAT_SPECIFICATION_MODE \ - | self.SQ1R_FORMAT_SPECIFICATION_MODE \ - | self.DQ1__FORMAT_SPECIFICATION_MODE \ - | self.DQ1R_FORMAT_SPECIFICATION_MODE \ - | self.SQ3__FORMAT_SPECIFICATION_MODE \ - | self.SQ3R_FORMAT_SPECIFICATION_MODE \ - | self.DQ3__FORMAT_SPECIFICATION_MODE \ - | self.DQ3R_FORMAT_SPECIFICATION_MODE: - - self.__pop_lexer_mode() - self.__pop_lexer_mode() - self.__pop_by_RBRACE() - case _: - self.__report_lexer_error("f-string: single '}' is not allowed") - - def __set_lexer_mode_by_FSTRING_START_token(self) -> None: - text = self.__cur_token.text.lower() - mode_map = { - "f'": self.SQ1__FSTRING_MODE, - "rf'": self.SQ1R_FSTRING_MODE, - "fr'": self.SQ1R_FSTRING_MODE, - 'f"': self.DQ1__FSTRING_MODE, - 'rf"': self.DQ1R_FSTRING_MODE, - 'fr"': self.DQ1R_FSTRING_MODE, - "f'''": self.SQ3__FSTRING_MODE, - "rf'''": self.SQ3R_FSTRING_MODE, - "fr'''": self.SQ3R_FSTRING_MODE, - 'f"""': self.DQ3__FSTRING_MODE, - 'rf"""': self.DQ3R_FSTRING_MODE, - 'fr"""': self.DQ3R_FSTRING_MODE, - } - mode = mode_map.get(text) - if mode is not None: - self.__push_lexer_mode(mode) - - def __set_lexer_mode_by_COLON_or_COLONEQUAL_token(self) -> None: - if self.__paren_or_bracket_opened_stack[-1] == 0: # stack peek == 0 - # COLONEQUAL token will be replaced with a COLON token in checkNextToken() - match self.__lexer_mode_stack[-1]: # check the previous lexer mode (the current is DEFAULT_MODE) - case self.SQ1__FSTRING_MODE \ - | self.SQ1__FORMAT_SPECIFICATION_MODE: - - self.__push_lexer_mode(self.SQ1__FORMAT_SPECIFICATION_MODE) # continue in format spec. mode - case self.SQ1R_FSTRING_MODE \ - | self.SQ1R_FORMAT_SPECIFICATION_MODE: - - self.__push_lexer_mode(self.SQ1R_FORMAT_SPECIFICATION_MODE) # continue in format spec. 
mode - case self.DQ1__FSTRING_MODE \ - | self.DQ1__FORMAT_SPECIFICATION_MODE: - - self.__push_lexer_mode(self.DQ1__FORMAT_SPECIFICATION_MODE) # continue in format spec. mode - case self.DQ1R_FSTRING_MODE \ - | self.DQ1R_FORMAT_SPECIFICATION_MODE: - - self.__push_lexer_mode(self.DQ1R_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode - case self.SQ3__FSTRING_MODE \ - | self.SQ3__FORMAT_SPECIFICATION_MODE: - - self.__push_lexer_mode(self.SQ3__FORMAT_SPECIFICATION_MODE) # continue in format spec. mode - case self.SQ3R_FSTRING_MODE \ - | self.SQ3R_FORMAT_SPECIFICATION_MODE: - - self.__push_lexer_mode(self.SQ3R_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode - case self.DQ3__FSTRING_MODE \ - | self.DQ3__FORMAT_SPECIFICATION_MODE: - - self.__push_lexer_mode(self.DQ3__FORMAT_SPECIFICATION_MODE) # continue in format spec. mode - case self.DQ3R_FSTRING_MODE \ - | self.DQ3R_FORMAT_SPECIFICATION_MODE: - - self.__push_lexer_mode(self.DQ3R_FORMAT_SPECIFICATION_MODE) # continue in format spec. 
mode - - def __pop_by_RBRACE(self) -> None: - self.__paren_or_bracket_opened_stack.pop() - self.__prev_brace_expression = self.__brace_expression_stack.pop() + "}" - if self.__brace_expression_stack: - # append the current brace expression with the previous brace expression - self.__brace_expression_stack[-1] += self.__prev_brace_expression - - def __handle_FSTRING_MIDDLE_token_with_double_brace(self) -> None: - # replace the trailing double brace with a single brace and insert a hidden brace token - match self.__get_last_two_chars_of_the_cur_token_text(): - case "{{": - self.__trim_last_char_add_pending_token_set_cur_token(self.LBRACE, "{", Token.HIDDEN_CHANNEL) - case "}}": - self.__trim_last_char_add_pending_token_set_cur_token(self.RBRACE, "}", Token.HIDDEN_CHANNEL) - - def __handle_FSTRING_MIDDLE_token_with_quote_and_lbrace(self) -> None: - # replace the trailing quote + left_brace with a quote and insert an LBRACE token - # replace the trailing backslash + left_brace with a backslash and insert an LBRACE token - match self.__get_last_two_chars_of_the_cur_token_text(): - case "\"{" | "'{" | "\\{": - self.__trim_last_char_add_pending_token_set_cur_token(self.LBRACE, "{", Token.DEFAULT_CHANNEL) - - def __get_last_two_chars_of_the_cur_token_text(self) -> str: - cur_token_text: str = self.__cur_token.text - return cur_token_text[-2:] if len(cur_token_text) >= 2 else cur_token_text - - def __trim_last_char_add_pending_token_set_cur_token(self, type: int, text: str, channel: int) -> None: - # trim the last char and add the modified curToken to the __pending_tokens stack - token_text_without_lbrace: str = self.__cur_token.text[:-1] - self.__cur_token.text = token_text_without_lbrace - self.__cur_token.stop -= 1 - self.__add_pending_token(self.__cur_token) - - self.__create_new_cur_token(type, text, channel) # set __cur_token - - def __handle_COLONEQUAL_token_in_fstring(self) -> None: - if self.__lexer_mode_stack \ - and self.__paren_or_bracket_opened_stack[-1] == 0: 
# stack peek == 0 - - # In fstring a colonequal (walrus operator) can only be used in parentheses - # Not in parentheses, replace COLONEQUAL token with COLON as format specifier - # and insert the equal symbol to the following FSTRING_MIDDLE token - self.__cur_token.type = self.COLON - self.__cur_token.text = ":" - self.__cur_token.stop = self.__cur_token.start - if self.__ffg_token.type == self.FSTRING_MIDDLE: - self.__ffg_token.text = "=" + self.__ffg_token.text - self.__ffg_token.start -= 1 - self.__ffg_token.column -= 1 - else: - self.__add_pending_token(self.__cur_token) - self.__create_new_current_token(self.FSTRING_MIDDLE, "=", Token.DEFAULT_CHANNEL) - self.__add_pending_token(self.__cur_token) - - def __create_new_cur_token(self, type: int, text: str, channel: int) -> None: - ctkn: CommonToken = self.__cur_token.clone() - ctkn.type = type - ctkn.text = text - ctkn.channel = channel - ctkn.column += 1 - ctkn.start += 1 - ctkn.stop = ctkn.start - self.__cur_token = ctkn - - def __push_lexer_mode(self, mode: int) -> None: - self.pushMode(mode) - self.__lexer_mode_stack.append(self.__cur_lexer_mode) # stack push - self.__cur_lexer_mode = mode - - def __pop_lexer_mode(self) -> None: - self.popMode() - self.__cur_lexer_mode = self.__lexer_mode_stack.pop() - - def __handle_FORMAT_SPECIFICATION_MODE(self) -> None: - if self.__lexer_mode_stack \ - and self.__ffg_token.type == self.RBRACE: - - # insert an empty FSTRING_MIDDLE token instead of the missing format specification - match self.__cur_token.type: - case self.COLON: - self.__create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", self.__ffg_token) - case self.RBRACE: - # only if the previous brace expression is not a dictionary comprehension or set comprehension - if not self.__is_dictionary_comprehension_or_set_comprehension(self.__prev_brace_expression): - self.__create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", self.__ffg_token) - - def 
__is_dictionary_comprehension_or_set_comprehension(self, code: str) -> bool: - from antlr4 import InputStream, CommonTokenStream - from PythonLexer import PythonLexer - from PythonParser import PythonParser - - input_stream: InputStream = InputStream(code) - lexer: PythonLexer = PythonLexer(input_stream) - token_stream: CommonTokenStream = CommonTokenStream(lexer) - parser: PythonParser = PythonParser(token_stream) - - # Disable error listeners to suppress console output - lexer.removeErrorListeners() - parser.removeErrorListeners() - - parser.dictcomp() # Try parsing as dictionary comprehension - if parser.getNumberOfSyntaxErrors() == 0: - return True - - parser = PythonParser(token_stream) - token_stream.seek(0) - parser.removeErrorListeners() - parser.setcomp() # Try parsing as set comprehension - return parser.getNumberOfSyntaxErrors() == 0 - - def __insert_trailing_tokens(self) -> None: - match self.__last_pending_token_type_from_default_channel: - case self.NEWLINE | self.DEDENT: - pass # no trailing NEWLINE token is needed - case _: # insert an extra trailing NEWLINE token that serves as the end of the last statement - self.__create_and_add_pending_token(self.NEWLINE, Token.DEFAULT_CHANNEL, None, self.__ffg_token) # _ffg_token is EOF - self.__insert_INDENT_or_DEDENT_token(0) # Now insert as much trailing DEDENT tokens as needed - - def __handle_EOF_token(self) -> None: - if self.__last_pending_token_type_from_default_channel > 0: - # there was statement in the input (leading NEWLINE tokens are hidden) - self.__insert_trailing_tokens() - self.__add_pending_token(self.__cur_token) - - def __hide_and_add_pending_token(self, ctkn: CommonToken) -> None: - ctkn.channel = Token.HIDDEN_CHANNEL - self.__add_pending_token(ctkn) - - def __create_and_add_pending_token(self, ttype: int, channel: int, text: Optional[str], sample_token: CommonToken) -> None: - ctkn: CommonToken = sample_token.clone() - ctkn.type = ttype - ctkn.channel = channel - ctkn.stop = 
sample_token.start - 1 - ctkn.text = "<" + self.symbolicNames[ttype] + ">" if text is None else \ - text - - self.__add_pending_token(ctkn) - - def __add_pending_token(self, ctkn: CommonToken) -> None: - # save the last pending token type because the _pending_tokens list can be empty by the nextToken() - self.__previous_pending_token_type = ctkn.type - if ctkn.channel == Token.DEFAULT_CHANNEL: - self.__last_pending_token_type_from_default_channel = self.__previous_pending_token_type - self.__pending_tokens.append(ctkn) - - def __get_indentation_length(self, indentText: str) -> int: # the indentText may contain spaces, tabs or form feeds - TAB_LENGTH: int = 8 # the standard number of spaces to replace a tab with spaces - length: int = 0 - ch: str - for ch in indentText: - match ch: - case ' ': - self.__was_space_indentation = True - length += 1 - case '\t': - self.__was_tab_indentation = True - length += TAB_LENGTH - (length % TAB_LENGTH) - case '\f': # form feed - length = 0 - - if self.__was_tab_indentation and self.__was_space_indentation: - if not self.__was_indentation_mixed_with_spaces_and_tabs: - self.__was_indentation_mixed_with_spaces_and_tabs = True - length = self.__INVALID_LENGTH # only for the first inconsistent indent - return length - - def __report_lexer_error(self, err_msg: str) -> None: - self.getErrorListenerDispatch().syntaxError(self, self.__cur_token, self.__cur_token.line, self.__cur_token.column, " LEXER" + self.__ERR_TXT + err_msg, None) - - def __report_error(self, err_msg: str) -> None: - self.__report_lexer_error(err_msg) - - # the ERRORTOKEN will raise an error in the parser - self.__create_and_add_pending_token(self.ERRORTOKEN, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__ffg_token) diff --git a/python/python3_13/README.md b/python/python3_13/README.md deleted file mode 100644 index 3f02d91e4f..0000000000 --- a/python/python3_13/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Python 3.13.2 parser - -### About files: -- 
PythonParser.g4 is the ANTLR4 parser grammar that based on the official [Python PEG grammar](https://docs.python.org/3.13/reference/grammar.html) - -- PythonLexerBase class - - handles the Python indentations - - creates encoding token - - tokenizes fstring literals - - and manage many other things - -- Example files from: [Python 3.13 Standard Lib](https://github.com/python/cpython/tree/3.13/Lib)

- -### Recent changes: -- parser grammar update for Python 3.13.2 -- added ENCODING token -- complete rewrite of fstring tokenizer in lexer grammar and PythonLexerBase class - - now correctly tokenizes the followings in fstring: - - escape sequences - - walrus operator - - dictionary comprehension - - set comprehension -- soft keywords changes: - - no embedded code (semantic predicates) in parser grammar for soft keywords - - no need for PythonParserBase class - - no need for transformGrammar.py - - **BREAKING CHANGES**: - - dedicated tokens for soft keywords instead of NAME token: - - NAME_OR_TYPE - - NAME_OR_MATCH - - NAME_OR_CASE - - NAME_OR_WILDCARD - -#### [Previous changes](https://github.com/antlr/grammars-v4/tree/master/python/python3_13)

- -### Related link: -[ANTLR4-parser-for-Python-3.13](https://github.com/RobEin/ANTLR4-parser-for-Python-3.13) \ No newline at end of file diff --git a/python/python3_13/TypeScript/PythonLexerBase.ts b/python/python3_13/TypeScript/PythonLexerBase.ts deleted file mode 100644 index 5ba9b2062e..0000000000 --- a/python/python3_13/TypeScript/PythonLexerBase.ts +++ /dev/null @@ -1,677 +0,0 @@ -/* -The MIT License (MIT) -Copyright (c) 2021 Robert Einhorn - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
- */ - -/* - * - * Project : Python Indent/Dedent handler for ANTLR4 grammars - * - * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com - * - */ - -import { CharStream, CharStreams, CommonTokenStream, Token, CommonToken, Lexer, TokenStream } from "antlr4"; -import PythonLexer from "./PythonLexer"; -import PythonParser from "./PythonParser"; -import * as Collections from "typescript-collections"; - -export default abstract class PythonLexerBase extends Lexer { - // A stack that keeps track of the indentation lengths - private indentLengthStack!: Collections.Stack; - // A list where tokens are waiting to be loaded into the token stream - private pendingTokens!: Array; - - // last pending token types - private previousPendingTokenType!: number; - private lastPendingTokenTypeFromDefaultChannel!: number; - - // The amount of opened parentheses, square brackets or curly braces - private opened!: number; - // The amount of opened parentheses and square brackets in the current lexer mode - private paren_or_bracket_openedStack!: Array; - // A stack that stores expression(s) between braces in fstring - private braceExpressionStack!: Array; - private prevBraceExpression!: string; - - // Instead of this._mode (_mode is not implemented in each ANTLR4 runtime) - private curLexerMode!: number; - // Instead of this._modeStack (_modeStack is not implemented in each ANTLR4 runtime) - private lexerModeStack!: Array; - - private wasSpaceIndentation!: boolean; - private wasTabIndentation!: boolean; - private wasIndentationMixedWithSpacesAndTabs!: boolean; - - private curToken: Token | undefined; // current (under processing) token - private ffgToken: Token | undefined; // following (look ahead) token - - private readonly INVALID_LENGTH: number = -1; - private readonly ERR_TXT: string = " ERROR: "; - - protected constructor(input: CharStream) { - super(input); - this.init(); - } - - public nextToken(): Token { // reading the input stream until a return EOF - 
this.checkNextToken(); - return this.pendingTokens.shift()! /* .pollFirst() */; // add the queued token to the token stream - } - - public reset(): void { - this.init(); - super.reset(); - } - - private init(): void { - this.indentLengthStack = new Collections.Stack(); - this.pendingTokens = []; - this.previousPendingTokenType = 0; - this.lastPendingTokenTypeFromDefaultChannel = 0; - this.opened = 0; - this.paren_or_bracket_openedStack = []; - this.braceExpressionStack = []; - this.prevBraceExpression = ""; - this.curLexerMode = 0; - this.lexerModeStack = []; - this.wasSpaceIndentation = false; - this.wasTabIndentation = false; - this.wasIndentationMixedWithSpacesAndTabs = false; - this.curToken = undefined; - this.ffgToken = undefined; - } - - private checkNextToken(): void { - if (this.previousPendingTokenType == PythonLexer.EOF) - return; - - if (this.indentLengthStack.isEmpty()) { // We're at the first token - this.insertENCODINGtoken(); - this.setCurrentAndFollowingTokens(); - this.handleStartOfInput(); - } else { - this.setCurrentAndFollowingTokens(); - } - - switch (this.curToken!.type) { - case PythonLexer.NEWLINE: - this.handleNEWLINEtoken(); - break; - case PythonLexer.LPAR: - case PythonLexer.LSQB: - case PythonLexer.LBRACE: - this.opened++; - this.addPendingToken(this.curToken!); - break; - case PythonLexer.RPAR: - case PythonLexer.RSQB: - case PythonLexer.RBRACE: - this.opened--; - this.addPendingToken(this.curToken!); - break; - case PythonLexer.FSTRING_MIDDLE: - this.handleFSTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field - this.addPendingToken(this.curToken!); - break; - case PythonLexer.COLONEQUAL: - this.handleCOLONEQUALtokenInFString(); - break; - case PythonLexer.ERRORTOKEN: - this.reportLexerError(`token recognition error at: '${this.curToken!.text}'`); - this.addPendingToken(this.curToken!); - break; - case PythonLexer.EOF: - this.handleEOFtoken(); - break; - default: - this.addPendingToken(this.curToken!); - } - 
this.handleFORMAT_SPECIFICATION_MODE(); - } - - private setCurrentAndFollowingTokens(): void { - this.curToken = this.ffgToken == undefined - ? super.nextToken() - : this.ffgToken; - - this.checkCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)! - - this.ffgToken = this.curToken.type === PythonLexer.EOF - ? this.curToken - : super.nextToken(); - } - - private insertENCODINGtoken(): void { // https://peps.python.org/pep-0263/ - let lineBuilder: string = ''; - let encodingName: string = ''; - let lineCount: number = 0; - const ws_commentPattern: RegExp = /^[ \t\f]*(#.*)?$/; - const charStream: CharStream = this._input; - const size: number = charStream.size; - - charStream.seek(0); - for (let i = 0; i < size; i++) { - const c: string = String.fromCharCode(charStream.LA(i + 1)); - lineBuilder += c; - - if (c === '\n' || i === size - 1) { - const line: string = lineBuilder.replace(/\r/g, '').replace(/\n/g, ''); - if (ws_commentPattern.test(line)) { // WS* + COMMENT? 
found - encodingName = this.getEncodingName(line); - if (encodingName !== '') { - break; // encoding found - } - } else { - break; // statement or backslash found (line is not empty, not whitespace(s), not comment) - } - - lineCount++; - if (lineCount >= 2) { - break; // check only the first two lines - } - lineBuilder = ''; - } - } - - if (encodingName === '') { - encodingName = 'utf-8'; // default Python source code encoding - } - - const encodingToken = new CommonToken([this, this._input], PythonLexer.ENCODING, Token.HIDDEN_CHANNEL, 0, 0); - encodingToken.text = encodingName; - encodingToken.line = 0; - encodingToken.column = -1; - this.addPendingToken(encodingToken); - } - - private getEncodingName(commentText: string): string { // https://peps.python.org/pep-0263/#defining-the-encoding - const encodingCommentPattern: RegExp = /^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)/; - const match: RegExpMatchArray | null = commentText.match(encodingCommentPattern); - return match ? match[1] : ''; - } - - // initialize the indentLengthStack - // hide the leading NEWLINE token(s) - // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel - // insert a leading INDENT token if necessary - private handleStartOfInput(): void { - // initialize the stack with a default 0 indentation length - this.indentLengthStack.push(0); // this will never be popped off - while (this.curToken!.type !== PythonLexer.EOF) { - if (this.curToken!.channel === Token.DEFAULT_CHANNEL) { - if (this.curToken!.type === PythonLexer.NEWLINE) { - // all the NEWLINE tokens must be ignored before the first statement - this.hideAndAddPendingToken(this.curToken!); - } else { // We're at the first statement - this.insertLeadingIndentToken(); - return; // continue the processing of the current token with checkNextToken() - } - } else { - this.addPendingToken(this.curToken!); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token - } - 
this.setCurrentAndFollowingTokens(); - } // continue the processing of the EOF token with checkNextToken() - } - - private insertLeadingIndentToken(): void { - if (this.previousPendingTokenType === PythonLexer.WS) { - const prevToken: Token = this.pendingTokens.at(-1)!; /* .peekLast() */ // WS token - if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement - const errMsg: string = "first statement indented"; - this.reportLexerError(errMsg); - // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser - this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken!); - } - } - } - - private handleNEWLINEtoken(): void { - if (this.lexerModeStack.length > 0) { - this.addPendingToken(this.curToken!); - } else if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token - this.hideAndAddPendingToken(this.curToken!); - } else { - const nlToken: Token = this.curToken?.clone()!; // save the current NEWLINE token - const isLookingAhead: boolean = this.ffgToken!.type === PythonLexer.WS; - if (isLookingAhead) { - this.setCurrentAndFollowingTokens(); // set the next two tokens - } - - switch (this.ffgToken!.type) { - case PythonLexer.NEWLINE: // We're before a blank line - case PythonLexer.COMMENT: // We're before a comment - this.hideAndAddPendingToken(nlToken); - if (isLookingAhead) { - this.addPendingToken(this.curToken!); // WS token - } - break; - default: - this.addPendingToken(nlToken); - if (isLookingAhead) { // We're on whitespace(s) followed by a statement - const indentationLength: number = this.ffgToken!.type === PythonLexer.EOF ? 
- 0 : - this.getIndentationLength(this.curToken!.text); - - if (indentationLength !== this.INVALID_LENGTH) { - this.addPendingToken(this.curToken!); // WS token - this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) - } else { - this.reportError("inconsistent use of tabs and spaces in indentation"); - } - } else { // We're at a newline followed by a statement (there is no whitespace before the statement) - this.insertIndentOrDedentToken(0); // may insert DEDENT token(s) - } - } - } - } - - private insertIndentOrDedentToken(indentLength: number): void { - let prevIndentLength: number = this.indentLengthStack.peek()!; - if (indentLength > prevIndentLength) { - this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!); - this.indentLengthStack.push(indentLength); - } else { - while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream - this.indentLengthStack.pop(); - prevIndentLength = this.indentLengthStack.peek()!; - if (indentLength <= prevIndentLength) { - this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!); - } else { - this.reportError("inconsistent dedent"); - } - } - } - } - - private checkCurToken(): void { - switch (this.curToken!.type) { - case PythonLexer.FSTRING_START: - this.setLexerModeByFSTRING_STARTtoken(); - return; - case PythonLexer.FSTRING_MIDDLE: - this.handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field - if (this.curToken!.type === PythonLexer.FSTRING_MIDDLE) { - return; - } - break; - case PythonLexer.FSTRING_END: - this.popLexerMode(); - return; - default: - if (this.lexerModeStack.length === 0) { - return; - } - } - - switch (this.curToken!.type) { // the following tokens can only come from default mode (after an LBRACE in fstring) - case PythonLexer.NEWLINE: - // append the current brace expression with the current newline - 
this.appendToBraceExpression(this.curToken!.text); - this.curToken!.channel = Token.HIDDEN_CHANNEL; - break; - case PythonLexer.LBRACE: - // the outermost brace expression cannot be a dictionary comprehension or a set comprehension - this.braceExpressionStack.push("{"); - this.paren_or_bracket_openedStack.push(0); - this.pushLexerMode(Lexer.DEFAULT_MODE); - break; - case PythonLexer.LPAR: - case PythonLexer.LSQB: - // append the current brace expression with a "(" or a "[" - this.appendToBraceExpression(this.curToken!.text); - // https://peps.python.org/pep-0498/#lambdas-inside-expressions - this.incrementBraceStack(); - break; - case PythonLexer.RPAR: - case PythonLexer.RSQB: - // append the current brace expression with a ")" or a "]" - this.appendToBraceExpression(this.curToken!.text); - this.decrementBraceStack(); - break; - case PythonLexer.COLON: - case PythonLexer.COLONEQUAL: - // append the current brace expression with a ":" or a ":=" - this.appendToBraceExpression(this.curToken!.text); - this.setLexerModeByCOLONorCOLONEQUALtoken(); - break; - case PythonLexer.RBRACE: - this.setLexerModeAfterRBRACEtoken(); - break; - default: - // append the current brace expression with the current token text - this.appendToBraceExpression(this.curToken!.text); - } - } - - private appendToBraceExpression(text: string): void { - this.braceExpressionStack[this.braceExpressionStack.length - 1] += text; - } - - private incrementBraceStack(): void { // increment the last element (peek() + 1) - this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1]++; - } - - private decrementBraceStack(): void { // decrement the last element (peek() - 1) - this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1]--; - } - - private setLexerModeAfterRBRACEtoken(): void { - switch (this.curLexerMode) { - case Lexer.DEFAULT_MODE: - this.popLexerMode(); - this.popByBRACE(); - break; - case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE: - case 
PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE: - case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE: - case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE: - this.popLexerMode(); - this.popLexerMode(); - this.popByBRACE(); - break; - default: - this.reportLexerError("f-string: single '}' is not allowed"); - } - } - - private setLexerModeByFSTRING_STARTtoken(): void { - const text = this.curToken!.text.toLowerCase(); - const modeMap: { [key: string]: number } = { - "f'": PythonLexer.SQ1__FSTRING_MODE, - "rf'": PythonLexer.SQ1R_FSTRING_MODE, - "fr'": PythonLexer.SQ1R_FSTRING_MODE, - 'f"': PythonLexer.DQ1__FSTRING_MODE, - 'rf"': PythonLexer.DQ1R_FSTRING_MODE, - 'fr"': PythonLexer.DQ1R_FSTRING_MODE, - "f'''": PythonLexer.SQ3__FSTRING_MODE, - "rf'''": PythonLexer.SQ3R_FSTRING_MODE, - "fr'''": PythonLexer.SQ3R_FSTRING_MODE, - 'f"""': PythonLexer.DQ3__FSTRING_MODE, - 'rf"""': PythonLexer.DQ3R_FSTRING_MODE, - 'fr"""': PythonLexer.DQ3R_FSTRING_MODE, - }; - const mode = modeMap[text]; - if (mode !== undefined) { - this.pushLexerMode(mode); - } - } - - private setLexerModeByCOLONorCOLONEQUALtoken(): void { - if (this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1] === 0) { // stack peek == 0 - const previousMode = this.lexerModeStack[this.lexerModeStack.length - 1]; // stack peek - switch (previousMode) { // check the previous lexer mode (the current is DEFAULT_MODE) - case PythonLexer.SQ1__FSTRING_MODE: - case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ1R_FSTRING_MODE: - case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. 
mode - break; - case PythonLexer.DQ1__FSTRING_MODE: - case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ1R_FSTRING_MODE: - case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ3__FSTRING_MODE: - case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.SQ3R_FSTRING_MODE: - case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ3__FSTRING_MODE: - case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode - break; - case PythonLexer.DQ3R_FSTRING_MODE: - case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE: - this.pushLexerMode(PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. 
mode - break; - } - } - } - - private popByBRACE(): void { - this.paren_or_bracket_openedStack.pop(); - this.prevBraceExpression = this.braceExpressionStack.pop() + "}"; - if (this.braceExpressionStack.length > 0) { - // append the current brace expression with the previous brace expression - this.braceExpressionStack[this.braceExpressionStack.length - 1] += this.prevBraceExpression; - } - } - - private handleFSTRING_MIDDLEtokenWithDoubleBrace(): void { - // Replace the trailing double brace with a single brace and insert a hidden brace token - switch (this.getLastTwoCharsOfTheCurTokenText()) { - case "{{": - this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL); - break; - case "}}": - this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL); - break; - } - } - - private handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(): void { - // Replace the trailing quote + left_brace with a quote and insert an LBRACE token - // Replace the trailing backslash + left_brace with a backslash and insert an LBRACE token - switch (this.getLastTwoCharsOfTheCurTokenText()) { - case "\"{": - case "'{": - case "\\{": - this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL); - break; - } - } - - private getLastTwoCharsOfTheCurTokenText(): string { - return this.curToken!.text.slice(-2); - } - - private trimLastCharAddPendingTokenSetCurToken(type: number, text: string, channel: number): void { - // Trim the last char and add the modified curToken to the pendingTokens stack - const tokenTextWithoutLastChar = this.curToken!.text.slice(0, -1); - this.curToken!.text = tokenTextWithoutLastChar; - this.curToken!.stop -= 1; - this.addPendingToken(this.curToken!); - - this.createNewCurToken(type, text, channel); // Set curToken - } - - private handleCOLONEQUALtokenInFString(): void { - if ( - this.lexerModeStack.length > 0 && - 
this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1] === 0 // stack peek == 0 - ) { - // In fstring, a colonequal (walrus operator) can only be used in parentheses - // Not in parentheses, replace COLONEQUAL token with COLON as format specifier - // and insert the equal symbol to the following FSTRING_MIDDLE token - this.curToken!.type = PythonLexer.COLON; - this.curToken!.text = ":"; - this.curToken!.stop = this.curToken!.start; - - if (this.ffgToken!.type === PythonLexer.FSTRING_MIDDLE) { - this.ffgToken!.text = "=" + this.ffgToken!.text; - this.ffgToken!.start -= 1; - this.ffgToken!.column -= 1; - } else { - this.addPendingToken(this.curToken!); - this.createNewCurToken(PythonLexer.FSTRING_MIDDLE, "=", Token.DEFAULT_CHANNEL); - } - } - this.addPendingToken(this.curToken!); - } - - private createNewCurToken(type: number, text: string, channel: number): void { - const ctkn = this.curToken!.clone(); - ctkn.type = type; - ctkn.text = text; - ctkn.channel = channel; - ctkn.column += 1; - ctkn.start += 1; - ctkn.stop = ctkn.start; - this.curToken = ctkn; - } - - private pushLexerMode(mode: number): void { - this.pushMode(mode); - this.lexerModeStack.push(this.curLexerMode); - this.curLexerMode = mode; - } - - private popLexerMode(): void { - this.popMode(); - this.curLexerMode = this.lexerModeStack.pop()!; - } - - private handleFORMAT_SPECIFICATION_MODE() { - if (this.lexerModeStack.length > 0 && - this.ffgToken!.type === PythonLexer.RBRACE) { - - // insert an empty FSTRING_MIDDLE token instead of the missing format specification - switch (this.curToken!.type) { - case PythonLexer.COLON: - this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken!); - break; - case PythonLexer.RBRACE: - // only if the previous brace expression is not a dictionary comprehension or set comprehension - if (!this.isDictionaryComprehensionOrSetComprehension(this.prevBraceExpression)) { - 
this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken!); - } - break; - } - } - } - - private isDictionaryComprehensionOrSetComprehension(code: string): boolean { - const inputStream: CharStream = CharStreams.fromString(code); - const lexer = new PythonLexer(inputStream); - const tokenStream = new CommonTokenStream(lexer); - let parser = new PythonParser(tokenStream); - - // Disable error listeners to suppress console output - lexer.removeErrorListeners(); - parser.removeErrorListeners(); - - parser.dictcomp(); // Try parsing as dictionary comprehension - if (parser.syntaxErrorsCount === 0) - return true; - - parser = new PythonParser(tokenStream); - (tokenStream as any).seek(0); // seek method is not declared in CommonTokenStream.d.ts - parser.removeErrorListeners(); - parser.setcomp(); // Try parsing as set comprehension - return parser.syntaxErrorsCount === 0; - } - - private insertTrailingTokens(): void { - switch (this.lastPendingTokenTypeFromDefaultChannel) { - case PythonLexer.NEWLINE: - case PythonLexer.DEDENT: - break; // no trailing NEWLINE token is needed - default: - // insert an extra trailing NEWLINE token that serves as the end of the last statement - this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken!); // ffgToken is EOF - } - this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed - } - - private handleEOFtoken(): void { - if (this.lastPendingTokenTypeFromDefaultChannel > 0) { - // there was a statement in the input (leading NEWLINE tokens are hidden) - this.insertTrailingTokens(); - } - this.addPendingToken(this.curToken!); - } - - private hideAndAddPendingToken(tkn: Token): void { - tkn.channel = Token.HIDDEN_CHANNEL; - this.addPendingToken(tkn); - } - - private createAndAddPendingToken(type: number, channel: number, text: string | null, sampleToken: Token): void { - const tkn: Token = sampleToken.clone(); - tkn.type = type; 
- tkn.channel = channel; - tkn.stop = sampleToken.start - 1; - tkn.text = text == null ? - `<${this.getSymbolicNames()[type]}>` : - text; - - this.addPendingToken(tkn); - } - - private addPendingToken(tkn: Token): void { - // save the last pending token type because the pendingTokens list can be empty by the nextToken() - this.previousPendingTokenType = tkn.type; - if (tkn.channel === Token.DEFAULT_CHANNEL) { - this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; - } - this.pendingTokens.push(tkn) /* .addLast(token) */; - } - - private getIndentationLength(indentText: string): number { // the indentText may contain spaces, tabs or form feeds - const TAB_LENGTH: number = 8; // the standard number of spaces to replace a tab to spaces - let length: number = 0; - for (let ch of indentText) { - switch (ch) { - case " ": - this.wasSpaceIndentation = true; - length += 1; - break; - case "\t": - this.wasTabIndentation = true; - length += TAB_LENGTH - (length % TAB_LENGTH); - break; - case "\f": // form feed - length = 0; - break; - } - } - - if (this.wasTabIndentation && this.wasSpaceIndentation) { - if (!this.wasIndentationMixedWithSpacesAndTabs) { - this.wasIndentationMixedWithSpacesAndTabs = true; - length = this.INVALID_LENGTH; // only for the first inconsistent indent - } - } - return length; - } - - private reportLexerError(errMsg: string): void { - this.getErrorListener().syntaxError(this, 0 /* this.curToken */, this.curToken!.line, this.curToken!.column, " LEXER" + this.ERR_TXT + errMsg, undefined); - } - - private reportError(errMsg: string): void { - this.reportLexerError(errMsg); - - // the ERRORTOKEN will raise an error in the parser - this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken!); - } -} diff --git a/python/python3_13/examples/_colorize.py b/python/python3_13/examples/_colorize.py deleted file mode 100644 index 845fb57a90..0000000000 --- 
a/python/python3_13/examples/_colorize.py +++ /dev/null @@ -1,64 +0,0 @@ -import io -import os -import sys - -COLORIZE = True - - -class ANSIColors: - BOLD_GREEN = "\x1b[1;32m" - BOLD_MAGENTA = "\x1b[1;35m" - BOLD_RED = "\x1b[1;31m" - GREEN = "\x1b[32m" - GREY = "\x1b[90m" - MAGENTA = "\x1b[35m" - RED = "\x1b[31m" - RESET = "\x1b[0m" - YELLOW = "\x1b[33m" - - -NoColors = ANSIColors() - -for attr in dir(NoColors): - if not attr.startswith("__"): - setattr(NoColors, attr, "") - - -def get_colors(colorize: bool = False) -> ANSIColors: - if colorize or can_colorize(): - return ANSIColors() - else: - return NoColors - - -def can_colorize() -> bool: - if sys.platform == "win32": - try: - import nt - - if not nt._supports_virtual_terminal(): - return False - except (ImportError, AttributeError): - return False - if not sys.flags.ignore_environment: - if os.environ.get("PYTHON_COLORS") == "0": - return False - if os.environ.get("PYTHON_COLORS") == "1": - return True - if "NO_COLOR" in os.environ: - return False - if not COLORIZE: - return False - if not sys.flags.ignore_environment: - if "FORCE_COLOR" in os.environ: - return True - if os.environ.get("TERM") == "dumb": - return False - - if not hasattr(sys.stderr, "fileno"): - return False - - try: - return os.isatty(sys.stderr.fileno()) - except io.UnsupportedOperation: - return sys.stderr.isatty() diff --git a/python/python3_13/examples/_compression.py b/python/python3_13/examples/_compression.py deleted file mode 100644 index e8b70aa0a3..0000000000 --- a/python/python3_13/examples/_compression.py +++ /dev/null @@ -1,162 +0,0 @@ -"""Internal classes used by the gzip, lzma and bz2 modules""" - -import io -import sys - -BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE # Compressed data read chunk size - - -class BaseStream(io.BufferedIOBase): - """Mode-checking helper functions.""" - - def _check_not_closed(self): - if self.closed: - raise ValueError("I/O operation on closed file") - - def _check_can_read(self): - if not 
self.readable(): - raise io.UnsupportedOperation("File not open for reading") - - def _check_can_write(self): - if not self.writable(): - raise io.UnsupportedOperation("File not open for writing") - - def _check_can_seek(self): - if not self.readable(): - raise io.UnsupportedOperation("Seeking is only supported " - "on files open for reading") - if not self.seekable(): - raise io.UnsupportedOperation("The underlying file object " - "does not support seeking") - - -class DecompressReader(io.RawIOBase): - """Adapts the decompressor API to a RawIOBase reader API""" - - def readable(self): - return True - - def __init__(self, fp, decomp_factory, trailing_error=(), **decomp_args): - self._fp = fp - self._eof = False - self._pos = 0 # Current offset in decompressed stream - - # Set to size of decompressed stream once it is known, for SEEK_END - self._size = -1 - - # Save the decompressor factory and arguments. - # If the file contains multiple compressed streams, each - # stream will need a separate decompressor object. A new decompressor - # object is also needed when implementing a backwards seek(). - self._decomp_factory = decomp_factory - self._decomp_args = decomp_args - self._decompressor = self._decomp_factory(**self._decomp_args) - - # Exception class to catch from decompressor signifying invalid - # trailing data to ignore - self._trailing_error = trailing_error - - def close(self): - self._decompressor = None - return super().close() - - def seekable(self): - return self._fp.seekable() - - def readinto(self, b): - with memoryview(b) as view, view.cast("B") as byte_view: - data = self.read(len(byte_view)) - byte_view[:len(data)] = data - return len(data) - - def read(self, size=-1): - if size < 0: - return self.readall() - - if not size or self._eof: - return b"" - data = None # Default if EOF is encountered - # Depending on the input data, our call to the decompressor may not - # return any data. In this case, try again after reading another block. 
- while True: - if self._decompressor.eof: - rawblock = (self._decompressor.unused_data or - self._fp.read(BUFFER_SIZE)) - if not rawblock: - break - # Continue to next stream. - self._decompressor = self._decomp_factory( - **self._decomp_args) - try: - data = self._decompressor.decompress(rawblock, size) - except self._trailing_error: - # Trailing data isn't a valid compressed stream; ignore it. - break - else: - if self._decompressor.needs_input: - rawblock = self._fp.read(BUFFER_SIZE) - if not rawblock: - raise EOFError("Compressed file ended before the " - "end-of-stream marker was reached") - else: - rawblock = b"" - data = self._decompressor.decompress(rawblock, size) - if data: - break - if not data: - self._eof = True - self._size = self._pos - return b"" - self._pos += len(data) - return data - - def readall(self): - chunks = [] - # sys.maxsize means the max length of output buffer is unlimited, - # so that the whole input buffer can be decompressed within one - # .decompress() call. - while data := self.read(sys.maxsize): - chunks.append(data) - - return b"".join(chunks) - - # Rewind the file to the beginning of the data stream. - def _rewind(self): - self._fp.seek(0) - self._eof = False - self._pos = 0 - self._decompressor = self._decomp_factory(**self._decomp_args) - - def seek(self, offset, whence=io.SEEK_SET): - # Recalculate offset as an absolute file position. - if whence == io.SEEK_SET: - pass - elif whence == io.SEEK_CUR: - offset = self._pos + offset - elif whence == io.SEEK_END: - # Seeking relative to EOF - we need to know the file's size. - if self._size < 0: - while self.read(io.DEFAULT_BUFFER_SIZE): - pass - offset = self._size + offset - else: - raise ValueError("Invalid value for whence: {}".format(whence)) - - # Make it so that offset is the number of bytes to skip forward. - if offset < self._pos: - self._rewind() - else: - offset -= self._pos - - # Read and discard data until we reach the desired position. 
- while offset > 0: - data = self.read(min(io.DEFAULT_BUFFER_SIZE, offset)) - if not data: - break - offset -= len(data) - - return self._pos - - def tell(self): - """Return the current file position.""" - return self._pos diff --git a/python/python3_13/examples/_opcode_metadata.py b/python/python3_13/examples/_opcode_metadata.py deleted file mode 100644 index b3d7b8103e..0000000000 --- a/python/python3_13/examples/_opcode_metadata.py +++ /dev/null @@ -1,343 +0,0 @@ -# This file is generated by Tools/cases_generator/py_metadata_generator.py -# from: -# Python/bytecodes.c -# Do not edit! -_specializations = { - "RESUME": [ - "RESUME_CHECK", - ], - "TO_BOOL": [ - "TO_BOOL_ALWAYS_TRUE", - "TO_BOOL_BOOL", - "TO_BOOL_INT", - "TO_BOOL_LIST", - "TO_BOOL_NONE", - "TO_BOOL_STR", - ], - "BINARY_OP": [ - "BINARY_OP_MULTIPLY_INT", - "BINARY_OP_ADD_INT", - "BINARY_OP_SUBTRACT_INT", - "BINARY_OP_MULTIPLY_FLOAT", - "BINARY_OP_ADD_FLOAT", - "BINARY_OP_SUBTRACT_FLOAT", - "BINARY_OP_ADD_UNICODE", - "BINARY_OP_INPLACE_ADD_UNICODE", - ], - "BINARY_SUBSCR": [ - "BINARY_SUBSCR_DICT", - "BINARY_SUBSCR_GETITEM", - "BINARY_SUBSCR_LIST_INT", - "BINARY_SUBSCR_STR_INT", - "BINARY_SUBSCR_TUPLE_INT", - ], - "STORE_SUBSCR": [ - "STORE_SUBSCR_DICT", - "STORE_SUBSCR_LIST_INT", - ], - "SEND": [ - "SEND_GEN", - ], - "UNPACK_SEQUENCE": [ - "UNPACK_SEQUENCE_TWO_TUPLE", - "UNPACK_SEQUENCE_TUPLE", - "UNPACK_SEQUENCE_LIST", - ], - "STORE_ATTR": [ - "STORE_ATTR_INSTANCE_VALUE", - "STORE_ATTR_SLOT", - "STORE_ATTR_WITH_HINT", - ], - "LOAD_GLOBAL": [ - "LOAD_GLOBAL_MODULE", - "LOAD_GLOBAL_BUILTIN", - ], - "LOAD_SUPER_ATTR": [ - "LOAD_SUPER_ATTR_ATTR", - "LOAD_SUPER_ATTR_METHOD", - ], - "LOAD_ATTR": [ - "LOAD_ATTR_INSTANCE_VALUE", - "LOAD_ATTR_MODULE", - "LOAD_ATTR_WITH_HINT", - "LOAD_ATTR_SLOT", - "LOAD_ATTR_CLASS", - "LOAD_ATTR_PROPERTY", - "LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN", - "LOAD_ATTR_METHOD_WITH_VALUES", - "LOAD_ATTR_METHOD_NO_DICT", - "LOAD_ATTR_METHOD_LAZY_DICT", - 
"LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES", - "LOAD_ATTR_NONDESCRIPTOR_NO_DICT", - ], - "COMPARE_OP": [ - "COMPARE_OP_FLOAT", - "COMPARE_OP_INT", - "COMPARE_OP_STR", - ], - "CONTAINS_OP": [ - "CONTAINS_OP_SET", - "CONTAINS_OP_DICT", - ], - "FOR_ITER": [ - "FOR_ITER_LIST", - "FOR_ITER_TUPLE", - "FOR_ITER_RANGE", - "FOR_ITER_GEN", - ], - "CALL": [ - "CALL_BOUND_METHOD_EXACT_ARGS", - "CALL_PY_EXACT_ARGS", - "CALL_TYPE_1", - "CALL_STR_1", - "CALL_TUPLE_1", - "CALL_BUILTIN_CLASS", - "CALL_BUILTIN_O", - "CALL_BUILTIN_FAST", - "CALL_BUILTIN_FAST_WITH_KEYWORDS", - "CALL_LEN", - "CALL_ISINSTANCE", - "CALL_LIST_APPEND", - "CALL_METHOD_DESCRIPTOR_O", - "CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS", - "CALL_METHOD_DESCRIPTOR_NOARGS", - "CALL_METHOD_DESCRIPTOR_FAST", - "CALL_ALLOC_AND_ENTER_INIT", - "CALL_PY_GENERAL", - "CALL_BOUND_METHOD_GENERAL", - "CALL_NON_PY_GENERAL", - ], -} - -_specialized_opmap = { - 'BINARY_OP_ADD_FLOAT': 150, - 'BINARY_OP_ADD_INT': 151, - 'BINARY_OP_ADD_UNICODE': 152, - 'BINARY_OP_INPLACE_ADD_UNICODE': 3, - 'BINARY_OP_MULTIPLY_FLOAT': 153, - 'BINARY_OP_MULTIPLY_INT': 154, - 'BINARY_OP_SUBTRACT_FLOAT': 155, - 'BINARY_OP_SUBTRACT_INT': 156, - 'BINARY_SUBSCR_DICT': 157, - 'BINARY_SUBSCR_GETITEM': 158, - 'BINARY_SUBSCR_LIST_INT': 159, - 'BINARY_SUBSCR_STR_INT': 160, - 'BINARY_SUBSCR_TUPLE_INT': 161, - 'CALL_ALLOC_AND_ENTER_INIT': 162, - 'CALL_BOUND_METHOD_EXACT_ARGS': 163, - 'CALL_BOUND_METHOD_GENERAL': 164, - 'CALL_BUILTIN_CLASS': 165, - 'CALL_BUILTIN_FAST': 166, - 'CALL_BUILTIN_FAST_WITH_KEYWORDS': 167, - 'CALL_BUILTIN_O': 168, - 'CALL_ISINSTANCE': 169, - 'CALL_LEN': 170, - 'CALL_LIST_APPEND': 171, - 'CALL_METHOD_DESCRIPTOR_FAST': 172, - 'CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS': 173, - 'CALL_METHOD_DESCRIPTOR_NOARGS': 174, - 'CALL_METHOD_DESCRIPTOR_O': 175, - 'CALL_NON_PY_GENERAL': 176, - 'CALL_PY_EXACT_ARGS': 177, - 'CALL_PY_GENERAL': 178, - 'CALL_STR_1': 179, - 'CALL_TUPLE_1': 180, - 'CALL_TYPE_1': 181, - 'COMPARE_OP_FLOAT': 182, - 'COMPARE_OP_INT': 183, 
- 'COMPARE_OP_STR': 184, - 'CONTAINS_OP_DICT': 185, - 'CONTAINS_OP_SET': 186, - 'FOR_ITER_GEN': 187, - 'FOR_ITER_LIST': 188, - 'FOR_ITER_RANGE': 189, - 'FOR_ITER_TUPLE': 190, - 'LOAD_ATTR_CLASS': 191, - 'LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN': 192, - 'LOAD_ATTR_INSTANCE_VALUE': 193, - 'LOAD_ATTR_METHOD_LAZY_DICT': 194, - 'LOAD_ATTR_METHOD_NO_DICT': 195, - 'LOAD_ATTR_METHOD_WITH_VALUES': 196, - 'LOAD_ATTR_MODULE': 197, - 'LOAD_ATTR_NONDESCRIPTOR_NO_DICT': 198, - 'LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES': 199, - 'LOAD_ATTR_PROPERTY': 200, - 'LOAD_ATTR_SLOT': 201, - 'LOAD_ATTR_WITH_HINT': 202, - 'LOAD_GLOBAL_BUILTIN': 203, - 'LOAD_GLOBAL_MODULE': 204, - 'LOAD_SUPER_ATTR_ATTR': 205, - 'LOAD_SUPER_ATTR_METHOD': 206, - 'RESUME_CHECK': 207, - 'SEND_GEN': 208, - 'STORE_ATTR_INSTANCE_VALUE': 209, - 'STORE_ATTR_SLOT': 210, - 'STORE_ATTR_WITH_HINT': 211, - 'STORE_SUBSCR_DICT': 212, - 'STORE_SUBSCR_LIST_INT': 213, - 'TO_BOOL_ALWAYS_TRUE': 214, - 'TO_BOOL_BOOL': 215, - 'TO_BOOL_INT': 216, - 'TO_BOOL_LIST': 217, - 'TO_BOOL_NONE': 218, - 'TO_BOOL_STR': 219, - 'UNPACK_SEQUENCE_LIST': 220, - 'UNPACK_SEQUENCE_TUPLE': 221, - 'UNPACK_SEQUENCE_TWO_TUPLE': 222, -} - -opmap = { - 'CACHE': 0, - 'RESERVED': 17, - 'RESUME': 149, - 'INSTRUMENTED_LINE': 254, - 'BEFORE_ASYNC_WITH': 1, - 'BEFORE_WITH': 2, - 'BINARY_SLICE': 4, - 'BINARY_SUBSCR': 5, - 'CHECK_EG_MATCH': 6, - 'CHECK_EXC_MATCH': 7, - 'CLEANUP_THROW': 8, - 'DELETE_SUBSCR': 9, - 'END_ASYNC_FOR': 10, - 'END_FOR': 11, - 'END_SEND': 12, - 'EXIT_INIT_CHECK': 13, - 'FORMAT_SIMPLE': 14, - 'FORMAT_WITH_SPEC': 15, - 'GET_AITER': 16, - 'GET_ANEXT': 18, - 'GET_ITER': 19, - 'GET_LEN': 20, - 'GET_YIELD_FROM_ITER': 21, - 'INTERPRETER_EXIT': 22, - 'LOAD_ASSERTION_ERROR': 23, - 'LOAD_BUILD_CLASS': 24, - 'LOAD_LOCALS': 25, - 'MAKE_FUNCTION': 26, - 'MATCH_KEYS': 27, - 'MATCH_MAPPING': 28, - 'MATCH_SEQUENCE': 29, - 'NOP': 30, - 'POP_EXCEPT': 31, - 'POP_TOP': 32, - 'PUSH_EXC_INFO': 33, - 'PUSH_NULL': 34, - 'RETURN_GENERATOR': 35, - 'RETURN_VALUE': 36, - 
'SETUP_ANNOTATIONS': 37, - 'STORE_SLICE': 38, - 'STORE_SUBSCR': 39, - 'TO_BOOL': 40, - 'UNARY_INVERT': 41, - 'UNARY_NEGATIVE': 42, - 'UNARY_NOT': 43, - 'WITH_EXCEPT_START': 44, - 'BINARY_OP': 45, - 'BUILD_CONST_KEY_MAP': 46, - 'BUILD_LIST': 47, - 'BUILD_MAP': 48, - 'BUILD_SET': 49, - 'BUILD_SLICE': 50, - 'BUILD_STRING': 51, - 'BUILD_TUPLE': 52, - 'CALL': 53, - 'CALL_FUNCTION_EX': 54, - 'CALL_INTRINSIC_1': 55, - 'CALL_INTRINSIC_2': 56, - 'CALL_KW': 57, - 'COMPARE_OP': 58, - 'CONTAINS_OP': 59, - 'CONVERT_VALUE': 60, - 'COPY': 61, - 'COPY_FREE_VARS': 62, - 'DELETE_ATTR': 63, - 'DELETE_DEREF': 64, - 'DELETE_FAST': 65, - 'DELETE_GLOBAL': 66, - 'DELETE_NAME': 67, - 'DICT_MERGE': 68, - 'DICT_UPDATE': 69, - 'ENTER_EXECUTOR': 70, - 'EXTENDED_ARG': 71, - 'FOR_ITER': 72, - 'GET_AWAITABLE': 73, - 'IMPORT_FROM': 74, - 'IMPORT_NAME': 75, - 'IS_OP': 76, - 'JUMP_BACKWARD': 77, - 'JUMP_BACKWARD_NO_INTERRUPT': 78, - 'JUMP_FORWARD': 79, - 'LIST_APPEND': 80, - 'LIST_EXTEND': 81, - 'LOAD_ATTR': 82, - 'LOAD_CONST': 83, - 'LOAD_DEREF': 84, - 'LOAD_FAST': 85, - 'LOAD_FAST_AND_CLEAR': 86, - 'LOAD_FAST_CHECK': 87, - 'LOAD_FAST_LOAD_FAST': 88, - 'LOAD_FROM_DICT_OR_DEREF': 89, - 'LOAD_FROM_DICT_OR_GLOBALS': 90, - 'LOAD_GLOBAL': 91, - 'LOAD_NAME': 92, - 'LOAD_SUPER_ATTR': 93, - 'MAKE_CELL': 94, - 'MAP_ADD': 95, - 'MATCH_CLASS': 96, - 'POP_JUMP_IF_FALSE': 97, - 'POP_JUMP_IF_NONE': 98, - 'POP_JUMP_IF_NOT_NONE': 99, - 'POP_JUMP_IF_TRUE': 100, - 'RAISE_VARARGS': 101, - 'RERAISE': 102, - 'RETURN_CONST': 103, - 'SEND': 104, - 'SET_ADD': 105, - 'SET_FUNCTION_ATTRIBUTE': 106, - 'SET_UPDATE': 107, - 'STORE_ATTR': 108, - 'STORE_DEREF': 109, - 'STORE_FAST': 110, - 'STORE_FAST_LOAD_FAST': 111, - 'STORE_FAST_STORE_FAST': 112, - 'STORE_GLOBAL': 113, - 'STORE_NAME': 114, - 'SWAP': 115, - 'UNPACK_EX': 116, - 'UNPACK_SEQUENCE': 117, - 'YIELD_VALUE': 118, - 'INSTRUMENTED_RESUME': 236, - 'INSTRUMENTED_END_FOR': 237, - 'INSTRUMENTED_END_SEND': 238, - 'INSTRUMENTED_RETURN_VALUE': 239, - 
'INSTRUMENTED_RETURN_CONST': 240, - 'INSTRUMENTED_YIELD_VALUE': 241, - 'INSTRUMENTED_LOAD_SUPER_ATTR': 242, - 'INSTRUMENTED_FOR_ITER': 243, - 'INSTRUMENTED_CALL': 244, - 'INSTRUMENTED_CALL_KW': 245, - 'INSTRUMENTED_CALL_FUNCTION_EX': 246, - 'INSTRUMENTED_INSTRUCTION': 247, - 'INSTRUMENTED_JUMP_FORWARD': 248, - 'INSTRUMENTED_JUMP_BACKWARD': 249, - 'INSTRUMENTED_POP_JUMP_IF_TRUE': 250, - 'INSTRUMENTED_POP_JUMP_IF_FALSE': 251, - 'INSTRUMENTED_POP_JUMP_IF_NONE': 252, - 'INSTRUMENTED_POP_JUMP_IF_NOT_NONE': 253, - 'JUMP': 256, - 'JUMP_NO_INTERRUPT': 257, - 'LOAD_CLOSURE': 258, - 'LOAD_METHOD': 259, - 'LOAD_SUPER_METHOD': 260, - 'LOAD_ZERO_SUPER_ATTR': 261, - 'LOAD_ZERO_SUPER_METHOD': 262, - 'POP_BLOCK': 263, - 'SETUP_CLEANUP': 264, - 'SETUP_FINALLY': 265, - 'SETUP_WITH': 266, - 'STORE_FAST_MAYBE_NULL': 267, -} - -HAVE_ARGUMENT = 44 -MIN_INSTRUMENTED_OPCODE = 236 diff --git a/python/python3_13/examples/_pylong.py b/python/python3_13/examples/_pylong.py deleted file mode 100644 index 4970eb3fa6..0000000000 --- a/python/python3_13/examples/_pylong.py +++ /dev/null @@ -1,363 +0,0 @@ -"""Python implementations of some algorithms for use by longobject.c. -The goal is to provide asymptotically faster algorithms that can be -used for operations on integers with many digits. In those cases, the -performance overhead of the Python implementation is not significant -since the asymptotic behavior is what dominates runtime. Functions -provided by this module should be considered private and not part of any -public API. - -Note: for ease of maintainability, please prefer clear code and avoid -"micro-optimizations". This module will only be imported and used for -integers with a huge number of digits. Saving a few microseconds with -tricky or non-obvious code is not worth it. 
For people looking for -maximum performance, they should use something like gmpy2.""" - -import re -import decimal -try: - import _decimal -except ImportError: - _decimal = None - -# A number of functions have this form, where `w` is a desired number of -# digits in base `base`: -# -# def inner(...w...): -# if w <= LIMIT: -# return something -# lo = w >> 1 -# hi = w - lo -# something involving base**lo, inner(...lo...), j, and inner(...hi...) -# figure out largest w needed -# result = inner(w) -# -# They all had some on-the-fly scheme to cache `base**lo` results for reuse. -# Power is costly. -# -# This routine aims to compute all amd only the needed powers in advance, as -# efficiently as reasonably possible. This isn't trivial, and all the -# on-the-fly methods did needless work in many cases. The driving code above -# changes to: -# -# figure out largest w needed -# mycache = compute_powers(w, base, LIMIT) -# result = inner(w) -# -# and `mycache[lo]` replaces `base**lo` in the inner function. -# -# While this does give minor speedups (a few percent at best), the primary -# intent is to simplify the functions using this, by eliminating the need for -# them to craft their own ad-hoc caching schemes. -def compute_powers(w, base, more_than, show=False): - seen = set() - need = set() - ws = {w} - while ws: - w = ws.pop() # any element is fine to use next - if w in seen or w <= more_than: - continue - seen.add(w) - lo = w >> 1 - # only _need_ lo here; some other path may, or may not, need hi - need.add(lo) - ws.add(lo) - if w & 1: - ws.add(lo + 1) - - d = {} - if not need: - return d - it = iter(sorted(need)) - first = next(it) - if show: - print("pow at", first) - d[first] = base ** first - for this in it: - if this - 1 in d: - if show: - print("* base at", this) - d[this] = d[this - 1] * base # cheap - else: - lo = this >> 1 - hi = this - lo - assert lo in d - if show: - print("square at", this) - # Multiplying a bigint by itself (same object!) 
is about twice - # as fast in CPython. - sq = d[lo] * d[lo] - if hi != lo: - assert hi == lo + 1 - if show: - print(" and * base") - sq *= base - d[this] = sq - return d - -_unbounded_dec_context = decimal.getcontext().copy() -_unbounded_dec_context.prec = decimal.MAX_PREC -_unbounded_dec_context.Emax = decimal.MAX_EMAX -_unbounded_dec_context.Emin = decimal.MIN_EMIN -_unbounded_dec_context.traps[decimal.Inexact] = 1 # sanity check - -def int_to_decimal(n): - """Asymptotically fast conversion of an 'int' to Decimal.""" - - # Function due to Tim Peters. See GH issue #90716 for details. - # https://github.com/python/cpython/issues/90716 - # - # The implementation in longobject.c of base conversion algorithms - # between power-of-2 and non-power-of-2 bases are quadratic time. - # This function implements a divide-and-conquer algorithm that is - # faster for large numbers. Builds an equal decimal.Decimal in a - # "clever" recursive way. If we want a string representation, we - # apply str to _that_. - - from decimal import Decimal as D - BITLIM = 200 - - # Don't bother caching the "lo" mask in this; the time to compute it is - # tiny compared to the multiply. - def inner(n, w): - if w <= BITLIM: - return D(n) - w2 = w >> 1 - hi = n >> w2 - lo = n & ((1 << w2) - 1) - return inner(lo, w2) + inner(hi, w - w2) * w2pow[w2] - - with decimal.localcontext(_unbounded_dec_context): - nbits = n.bit_length() - w2pow = compute_powers(nbits, D(2), BITLIM) - if n < 0: - negate = True - n = -n - else: - negate = False - result = inner(n, nbits) - if negate: - result = -result - return result - -def int_to_decimal_string(n): - """Asymptotically fast conversion of an 'int' to a decimal string.""" - w = n.bit_length() - if w > 450_000 and _decimal is not None: - # It is only usable with the C decimal implementation. - # _pydecimal.py calls str() on very large integers, which in its - # turn calls int_to_decimal_string(), causing very deep recursion. 
- return str(int_to_decimal(n)) - - # Fallback algorithm for the case when the C decimal module isn't - # available. This algorithm is asymptotically worse than the algorithm - # using the decimal module, but better than the quadratic time - # implementation in longobject.c. - - DIGLIM = 1000 - def inner(n, w): - if w <= DIGLIM: - return str(n) - w2 = w >> 1 - hi, lo = divmod(n, pow10[w2]) - return inner(hi, w - w2) + inner(lo, w2).zfill(w2) - - # The estimation of the number of decimal digits. - # There is no harm in small error. If we guess too large, there may - # be leading 0's that need to be stripped. If we guess too small, we - # may need to call str() recursively for the remaining highest digits, - # which can still potentially be a large integer. This is manifested - # only if the number has way more than 10**15 digits, that exceeds - # the 52-bit physical address limit in both Intel64 and AMD64. - w = int(w * 0.3010299956639812 + 1) # log10(2) - pow10 = compute_powers(w, 5, DIGLIM) - for k, v in pow10.items(): - pow10[k] = v << k # 5**k << k == 5**k * 2**k == 10**k - if n < 0: - n = -n - sign = '-' - else: - sign = '' - s = inner(n, w) - if s[0] == '0' and n: - # If our guess of w is too large, there may be leading 0's that - # need to be stripped. - s = s.lstrip('0') - return sign + s - -def _str_to_int_inner(s): - """Asymptotically fast conversion of a 'str' to an 'int'.""" - - # Function due to Bjorn Martinsson. See GH issue #90716 for details. - # https://github.com/python/cpython/issues/90716 - # - # The implementation in longobject.c of base conversion algorithms - # between power-of-2 and non-power-of-2 bases are quadratic time. - # This function implements a divide-and-conquer algorithm making use - # of Python's built in big int multiplication. Since Python uses the - # Karatsuba algorithm for multiplication, the time complexity - # of this function is O(len(s)**1.58). 
- - DIGLIM = 2048 - - def inner(a, b): - if b - a <= DIGLIM: - return int(s[a:b]) - mid = (a + b + 1) >> 1 - return (inner(mid, b) - + ((inner(a, mid) * w5pow[b - mid]) - << (b - mid))) - - w5pow = compute_powers(len(s), 5, DIGLIM) - return inner(0, len(s)) - - -def int_from_string(s): - """Asymptotically fast version of PyLong_FromString(), conversion - of a string of decimal digits into an 'int'.""" - # PyLong_FromString() has already removed leading +/-, checked for invalid - # use of underscore characters, checked that string consists of only digits - # and underscores, and stripped leading whitespace. The input can still - # contain underscores and have trailing whitespace. - s = s.rstrip().replace('_', '') - return _str_to_int_inner(s) - -def str_to_int(s): - """Asymptotically fast version of decimal string to 'int' conversion.""" - # FIXME: this doesn't support the full syntax that int() supports. - m = re.match(r'\s*([+-]?)([0-9_]+)\s*', s) - if not m: - raise ValueError('invalid literal for int() with base 10') - v = int_from_string(m.group(2)) - if m.group(1) == '-': - v = -v - return v - - -# Fast integer division, based on code from Mark Dickinson, fast_div.py -# GH-47701. Additional refinements and optimizations by Bjorn Martinsson. The -# algorithm is due to Burnikel and Ziegler, in their paper "Fast Recursive -# Division". - -_DIV_LIMIT = 4000 - - -def _div2n1n(a, b, n): - """Divide a 2n-bit nonnegative integer a by an n-bit positive integer - b, using a recursive divide-and-conquer algorithm. - - Inputs: - n is a positive integer - b is a positive integer with exactly n bits - a is a nonnegative integer such that a < 2**n * b - - Output: - (q, r) such that a = b*q+r and 0 <= r < b. 
- - """ - if a.bit_length() - n <= _DIV_LIMIT: - return divmod(a, b) - pad = n & 1 - if pad: - a <<= 1 - b <<= 1 - n += 1 - half_n = n >> 1 - mask = (1 << half_n) - 1 - b1, b2 = b >> half_n, b & mask - q1, r = _div3n2n(a >> n, (a >> half_n) & mask, b, b1, b2, half_n) - q2, r = _div3n2n(r, a & mask, b, b1, b2, half_n) - if pad: - r >>= 1 - return q1 << half_n | q2, r - - -def _div3n2n(a12, a3, b, b1, b2, n): - """Helper function for _div2n1n; not intended to be called directly.""" - if a12 >> n == b1: - q, r = (1 << n) - 1, a12 - (b1 << n) + b1 - else: - q, r = _div2n1n(a12, b1, n) - r = (r << n | a3) - q * b2 - while r < 0: - q -= 1 - r += b - return q, r - - -def _int2digits(a, n): - """Decompose non-negative int a into base 2**n - - Input: - a is a non-negative integer - - Output: - List of the digits of a in base 2**n in little-endian order, - meaning the most significant digit is last. The most - significant digit is guaranteed to be non-zero. - If a is 0 then the output is an empty list. - - """ - a_digits = [0] * ((a.bit_length() + n - 1) // n) - - def inner(x, L, R): - if L + 1 == R: - a_digits[L] = x - return - mid = (L + R) >> 1 - shift = (mid - L) * n - upper = x >> shift - lower = x ^ (upper << shift) - inner(lower, L, mid) - inner(upper, mid, R) - - if a: - inner(a, 0, len(a_digits)) - return a_digits - - -def _digits2int(digits, n): - """Combine base-2**n digits into an int. This function is the - inverse of `_int2digits`. For more details, see _int2digits. 
- """ - - def inner(L, R): - if L + 1 == R: - return digits[L] - mid = (L + R) >> 1 - shift = (mid - L) * n - return (inner(mid, R) << shift) + inner(L, mid) - - return inner(0, len(digits)) if digits else 0 - - -def _divmod_pos(a, b): - """Divide a non-negative integer a by a positive integer b, giving - quotient and remainder.""" - # Use grade-school algorithm in base 2**n, n = nbits(b) - n = b.bit_length() - a_digits = _int2digits(a, n) - - r = 0 - q_digits = [] - for a_digit in reversed(a_digits): - q_digit, r = _div2n1n((r << n) + a_digit, b, n) - q_digits.append(q_digit) - q_digits.reverse() - q = _digits2int(q_digits, n) - return q, r - - -def int_divmod(a, b): - """Asymptotically fast replacement for divmod, for 'int'. - Its time complexity is O(n**1.58), where n = #bits(a) + #bits(b). - """ - if b == 0: - raise ZeroDivisionError - elif b < 0: - q, r = int_divmod(-a, -b) - return q, -r - elif a < 0: - q, r = int_divmod(~a, b) - return ~q, b + ~r - else: - return _divmod_pos(a, b) diff --git a/python/python3_13/examples/_threading_local.py b/python/python3_13/examples/_threading_local.py deleted file mode 100644 index b006d76c4e..0000000000 --- a/python/python3_13/examples/_threading_local.py +++ /dev/null @@ -1,242 +0,0 @@ -"""Thread-local objects. - -(Note that this module provides a Python version of the threading.local - class. Depending on the version of Python you're using, there may be a - faster one available. You should always import the `local` class from - `threading`.) - -Thread-local objects support the management of thread-local data. 
-If you have data that you want to be local to a thread, simply create -a thread-local object and use its attributes: - - >>> mydata = local() - >>> mydata.number = 42 - >>> mydata.number - 42 - -You can also access the local-object's dictionary: - - >>> mydata.__dict__ - {'number': 42} - >>> mydata.__dict__.setdefault('widgets', []) - [] - >>> mydata.widgets - [] - -What's important about thread-local objects is that their data are -local to a thread. If we access the data in a different thread: - - >>> log = [] - >>> def f(): - ... items = sorted(mydata.__dict__.items()) - ... log.append(items) - ... mydata.number = 11 - ... log.append(mydata.number) - - >>> import threading - >>> thread = threading.Thread(target=f) - >>> thread.start() - >>> thread.join() - >>> log - [[], 11] - -we get different data. Furthermore, changes made in the other thread -don't affect data seen in this thread: - - >>> mydata.number - 42 - -Of course, values you get from a local object, including a __dict__ -attribute, are for whatever thread was current at the time the -attribute was read. For that reason, you generally don't want to save -these values across threads, as they apply only to the thread they -came from. - -You can create custom local objects by subclassing the local class: - - >>> class MyLocal(local): - ... number = 2 - ... def __init__(self, /, **kw): - ... self.__dict__.update(kw) - ... def squared(self): - ... return self.number ** 2 - -This can be useful to support default values, methods and -initialization. Note that if you define an __init__ method, it will be -called each time the local object is used in a separate thread. This -is necessary to initialize each thread's dictionary. 
- -Now if we create a local object: - - >>> mydata = MyLocal(color='red') - -Now we have a default number: - - >>> mydata.number - 2 - -an initial color: - - >>> mydata.color - 'red' - >>> del mydata.color - -And a method that operates on the data: - - >>> mydata.squared() - 4 - -As before, we can access the data in a separate thread: - - >>> log = [] - >>> thread = threading.Thread(target=f) - >>> thread.start() - >>> thread.join() - >>> log - [[('color', 'red')], 11] - -without affecting this thread's data: - - >>> mydata.number - 2 - >>> mydata.color - Traceback (most recent call last): - ... - AttributeError: 'MyLocal' object has no attribute 'color' - -Note that subclasses can define slots, but they are not thread -local. They are shared across threads: - - >>> class MyLocal(local): - ... __slots__ = 'number' - - >>> mydata = MyLocal() - >>> mydata.number = 42 - >>> mydata.color = 'red' - -So, the separate thread: - - >>> thread = threading.Thread(target=f) - >>> thread.start() - >>> thread.join() - -affects what we see: - - >>> mydata.number - 11 - ->>> del mydata -""" - -from weakref import ref -from contextlib import contextmanager - -__all__ = ["local"] - -# We need to use objects from the threading module, but the threading -# module may also want to use our `local` class, if support for locals -# isn't compiled in to the `thread` module. This creates potential problems -# with circular imports. For that reason, we don't import `threading` -# until the bottom of this file (a hack sufficient to worm around the -# potential problems). Note that all platforms on CPython do have support -# for locals in the `thread` module, and there is no circular import problem -# then, so problems introduced by fiddling the order of imports here won't -# manifest. 
- -class _localimpl: - """A class managing thread-local dicts""" - __slots__ = 'key', 'dicts', 'localargs', 'locallock', '__weakref__' - - def __init__(self): - # The key used in the Thread objects' attribute dicts. - # We keep it a string for speed but make it unlikely to clash with - # a "real" attribute. - self.key = '_threading_local._localimpl.' + str(id(self)) - # { id(Thread) -> (ref(Thread), thread-local dict) } - self.dicts = {} - - def get_dict(self): - """Return the dict for the current thread. Raises KeyError if none - defined.""" - thread = current_thread() - return self.dicts[id(thread)][1] - - def create_dict(self): - """Create a new dict for the current thread, and return it.""" - localdict = {} - key = self.key - thread = current_thread() - idt = id(thread) - def local_deleted(_, key=key): - # When the localimpl is deleted, remove the thread attribute. - thread = wrthread() - if thread is not None: - del thread.__dict__[key] - def thread_deleted(_, idt=idt): - # When the thread is deleted, remove the local dict. - # Note that this is suboptimal if the thread object gets - # caught in a reference loop. We would like to be called - # as soon as the OS-level thread ends instead. 
- local = wrlocal() - if local is not None: - dct = local.dicts.pop(idt) - wrlocal = ref(self, local_deleted) - wrthread = ref(thread, thread_deleted) - thread.__dict__[key] = wrlocal - self.dicts[idt] = wrthread, localdict - return localdict - - -@contextmanager -def _patch(self): - impl = object.__getattribute__(self, '_local__impl') - try: - dct = impl.get_dict() - except KeyError: - dct = impl.create_dict() - args, kw = impl.localargs - self.__init__(*args, **kw) - with impl.locallock: - object.__setattr__(self, '__dict__', dct) - yield - - -class local: - __slots__ = '_local__impl', '__dict__' - - def __new__(cls, /, *args, **kw): - if (args or kw) and (cls.__init__ is object.__init__): - raise TypeError("Initialization arguments are not supported") - self = object.__new__(cls) - impl = _localimpl() - impl.localargs = (args, kw) - impl.locallock = RLock() - object.__setattr__(self, '_local__impl', impl) - # We need to create the thread dict in anticipation of - # __init__ being called, to make sure we don't call it - # again ourselves. - impl.create_dict() - return self - - def __getattribute__(self, name): - with _patch(self): - return object.__getattribute__(self, name) - - def __setattr__(self, name, value): - if name == '__dict__': - raise AttributeError( - "%r object attribute '__dict__' is read-only" - % self.__class__.__name__) - with _patch(self): - return object.__setattr__(self, name, value) - - def __delattr__(self, name): - if name == '__dict__': - raise AttributeError( - "%r object attribute '__dict__' is read-only" - % self.__class__.__name__) - with _patch(self): - return object.__delattr__(self, name) - - -from threading import current_thread, RLock diff --git a/python/python3_13/examples/_weakrefset.py b/python/python3_13/examples/_weakrefset.py deleted file mode 100644 index 489eec714e..0000000000 --- a/python/python3_13/examples/_weakrefset.py +++ /dev/null @@ -1,205 +0,0 @@ -# Access WeakSet through the weakref module. 
-# This code is separated-out because it is needed -# by abc.py to load everything else at startup. - -from _weakref import ref -from types import GenericAlias - -__all__ = ['WeakSet'] - - -class _IterationGuard: - # This context manager registers itself in the current iterators of the - # weak container, such as to delay all removals until the context manager - # exits. - # This technique should be relatively thread-safe (since sets are). - - def __init__(self, weakcontainer): - # Don't create cycles - self.weakcontainer = ref(weakcontainer) - - def __enter__(self): - w = self.weakcontainer() - if w is not None: - w._iterating.add(self) - return self - - def __exit__(self, e, t, b): - w = self.weakcontainer() - if w is not None: - s = w._iterating - s.remove(self) - if not s: - w._commit_removals() - - -class WeakSet: - def __init__(self, data=None): - self.data = set() - def _remove(item, selfref=ref(self)): - self = selfref() - if self is not None: - if self._iterating: - self._pending_removals.append(item) - else: - self.data.discard(item) - self._remove = _remove - # A list of keys to be removed - self._pending_removals = [] - self._iterating = set() - if data is not None: - self.update(data) - - def _commit_removals(self): - pop = self._pending_removals.pop - discard = self.data.discard - while True: - try: - item = pop() - except IndexError: - return - discard(item) - - def __iter__(self): - with _IterationGuard(self): - for itemref in self.data: - item = itemref() - if item is not None: - # Caveat: the iterator will keep a strong reference to - # `item` until it is resumed or closed. 
- yield item - - def __len__(self): - return len(self.data) - len(self._pending_removals) - - def __contains__(self, item): - try: - wr = ref(item) - except TypeError: - return False - return wr in self.data - - def __reduce__(self): - return self.__class__, (list(self),), self.__getstate__() - - def add(self, item): - if self._pending_removals: - self._commit_removals() - self.data.add(ref(item, self._remove)) - - def clear(self): - if self._pending_removals: - self._commit_removals() - self.data.clear() - - def copy(self): - return self.__class__(self) - - def pop(self): - if self._pending_removals: - self._commit_removals() - while True: - try: - itemref = self.data.pop() - except KeyError: - raise KeyError('pop from empty WeakSet') from None - item = itemref() - if item is not None: - return item - - def remove(self, item): - if self._pending_removals: - self._commit_removals() - self.data.remove(ref(item)) - - def discard(self, item): - if self._pending_removals: - self._commit_removals() - self.data.discard(ref(item)) - - def update(self, other): - if self._pending_removals: - self._commit_removals() - for element in other: - self.add(element) - - def __ior__(self, other): - self.update(other) - return self - - def difference(self, other): - newset = self.copy() - newset.difference_update(other) - return newset - __sub__ = difference - - def difference_update(self, other): - self.__isub__(other) - def __isub__(self, other): - if self._pending_removals: - self._commit_removals() - if self is other: - self.data.clear() - else: - self.data.difference_update(ref(item) for item in other) - return self - - def intersection(self, other): - return self.__class__(item for item in other if item in self) - __and__ = intersection - - def intersection_update(self, other): - self.__iand__(other) - def __iand__(self, other): - if self._pending_removals: - self._commit_removals() - self.data.intersection_update(ref(item) for item in other) - return self - - def 
issubset(self, other): - return self.data.issubset(ref(item) for item in other) - __le__ = issubset - - def __lt__(self, other): - return self.data < set(map(ref, other)) - - def issuperset(self, other): - return self.data.issuperset(ref(item) for item in other) - __ge__ = issuperset - - def __gt__(self, other): - return self.data > set(map(ref, other)) - - def __eq__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - return self.data == set(map(ref, other)) - - def symmetric_difference(self, other): - newset = self.copy() - newset.symmetric_difference_update(other) - return newset - __xor__ = symmetric_difference - - def symmetric_difference_update(self, other): - self.__ixor__(other) - def __ixor__(self, other): - if self._pending_removals: - self._commit_removals() - if self is other: - self.data.clear() - else: - self.data.symmetric_difference_update(ref(item, self._remove) for item in other) - return self - - def union(self, other): - return self.__class__(e for s in (self, other) for e in s) - __or__ = union - - def isdisjoint(self, other): - return len(self.intersection(other)) == 0 - - def __repr__(self): - return repr(self.data) - - __class_getitem__ = classmethod(GenericAlias) diff --git a/python/python3_14/CSharp/PythonLexerBase.cs b/python/python3_14/CSharp/PythonLexerBase.cs new file mode 100644 index 0000000000..02d87ba3dd --- /dev/null +++ b/python/python3_14/CSharp/PythonLexerBase.cs @@ -0,0 +1,877 @@ +/* +The MIT License (MIT) +Copyright (c) 2021 Robert Einhorn + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright 
notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + */ + +/* + * Project : A helper class for an ANTLR4 Python lexer grammar that assists in tokenizing indentation, + * interpolated strings, and encoding declaration. + * + * Developed by : Robert Einhorn + */ + +using Antlr4.Runtime; + +[assembly: CLSCompliant(true)] + +public abstract class PythonLexerBase : Lexer +{ + private static readonly Dictionary LEXER_MODES_FOR_ISTRING_START = []; + + private const int INVALID_LENGTH = -1; + private const string ERR_TXT = " ERROR: "; + private const int TAB_LENGTH = 8; + + private string encodingName = ""; + + // Indentation handling + private Stack indentLengthStack = new(); + private LinkedList pendingTokens = new(); + + private int previousPendingTokenType; + private int lastPendingTokenTypeFromDefaultChannel; + + // Parenthesis / bracket / brace counts + private int opened; + private Stack paren_or_bracket_openedStack = new(); + private Stack braceExpressionStack = new(); + private string prevBraceExpression = ""; + + // Current interpolated STRING_MIDDLE token type (FSTRING_MIDDLE or TSTRING_MIDDLE) + private int curISTRING_MIDDLEtokenType; + + // We reimplement mode/stack because not all runtimes expose _mode/_modeStack + private int curLexerMode; + private Stack lexerModeStack = new(); + + // Indentation diagnostics + private bool wasSpaceIndentation; + private bool wasTabIndentation; + private bool wasIndentationMixedWithSpacesAndTabs; + + // 
Current / lookahead tokens + private IToken curToken = null!; + private IToken ffgToken = null!; + + protected PythonLexerBase(ICharStream input) + : this(input, Console.Out, Console.Error) { } + + protected PythonLexerBase(ICharStream input, TextWriter output, TextWriter errorOutput) + : base(input, output, errorOutput) { } + + public override void Reset() + { + this.Init(); + base.Reset(); + } + + private void Init() + { + this.encodingName = ""; + this.indentLengthStack = new(); + this.pendingTokens = new(); + this.previousPendingTokenType = 0; + this.lastPendingTokenTypeFromDefaultChannel = 0; + this.opened = 0; + this.paren_or_bracket_openedStack = new(); + this.braceExpressionStack = new(); + this.prevBraceExpression = ""; + this.curISTRING_MIDDLEtokenType = 0; + this.curLexerMode = Lexer.DEFAULT_MODE; + this.lexerModeStack = new(); + this.wasSpaceIndentation = false; + this.wasTabIndentation = false; + this.wasIndentationMixedWithSpacesAndTabs = false; + this.curToken = null!; + this.ffgToken = null!; + } + + /// + /// Sets the encoding name to emit an ENCODING token at the start of the token stream. + /// Leave empty if not needed (e.g., when parsing from string). + /// + /// + /// The encoding name (e.g., "utf-8"), or empty string to disable ENCODING token. 
+ /// + public void SetEncodingName(string encodingName) + { + this.encodingName = encodingName; + } + + public override IToken NextToken() // Reading the input stream until EOF is reached + { + this.CheckNextToken(); + IToken firstPendingToken = this.pendingTokens.First!.Value; + this.pendingTokens.RemoveFirst(); + return firstPendingToken; // Add the queued token to the token stream + } + + private void CheckNextToken() + { + if (this.previousPendingTokenType == TokenConstants.EOF) + return; + + this.SetCurrentAndFollowingTokens(); + if (this.indentLengthStack.Count == 0) // We're at the first token + { + this.HandleStartOfInput(); + } + + switch (this.curToken.Type) + { + case PythonLexer.NEWLINE: + this.HandleNEWLINEtoken(); + break; + case PythonLexer.LPAR: + case PythonLexer.LSQB: + case PythonLexer.LBRACE: + this.opened++; + this.AddPendingToken(this.curToken); + break; + case PythonLexer.RPAR: + case PythonLexer.RSQB: + case PythonLexer.RBRACE: + this.opened--; + this.AddPendingToken(this.curToken); + break; + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + this.HandleISTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field + this.AddPendingToken(this.curToken); + break; + case PythonLexer.COLONEQUAL: + this.HandleCOLONEQUALtokenInIString(); + break; + case PythonLexer.ERRORTOKEN: + ReportLexerError($"token recognition error at: '{curToken.Text}'"); + this.AddPendingToken(this.curToken); + break; + case TokenConstants.EOF: + this.HandleEOFtoken(); + break; + default: + this.AddPendingToken(this.curToken); + break; + } + this.HandleFORMAT_SPECIFICATION_MODE(); + } + + private void SetCurrentAndFollowingTokens() + { + this.curToken = this.ffgToken == null ? + base.NextToken() : + this.ffgToken; + + this.CheckCurToken(); // Do not use ffgToken in this method or any of its submethods — it hasn't been set yet! + + this.ffgToken = this.curToken.Type == TokenConstants.EOF ? 
+ this.curToken : + base.NextToken(); + } + + // - initialize indent stack + // - skip BOM token + // - insert ENCODING token (if any) + // - hide leading NEWLINE(s) + // - insert leading INDENT if first statement is indented + private void HandleStartOfInput() + { + this.indentLengthStack.Push(0); // this will never be popped off + + if (this.curToken.Type == PythonLexer.BOM) + { + this.SetCurrentAndFollowingTokens(); + } + this.InsertENCODINGtoken(); + + while (this.curToken.Type != TokenConstants.EOF) + { + if (this.curToken.Channel == TokenConstants.DefaultChannel) + { + if (this.curToken.Type == PythonLexer.NEWLINE) + { + // all the NEWLINE tokens must be ignored before the first statement + this.HideAndAddPendingToken(this.curToken); + } + else + { // We're at the first statement + this.InsertLeadingIndentToken(); + return; // continue the processing of the current token with CheckNextToken() + } + } + else + { + this.AddPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING, or COMMENT token + } + this.SetCurrentAndFollowingTokens(); + } // continue the processing of the EOF token with CheckNextToken() + } + + private void InsertENCODINGtoken() // https://peps.python.org/pep-0263/ + { + if (this.encodingName == "") return; + + var sourcePair = new Tuple(this, (ICharStream)this.InputStream); + var encodingToken = new CommonToken(sourcePair, PythonLexer.ENCODING, TokenConstants.HiddenChannel, start: 0, stop: 0); + encodingToken.Text = this.encodingName; + encodingToken.Line = 0; + encodingToken.Column = -1; + AddPendingToken(encodingToken); + } + + private void InsertLeadingIndentToken() + { + if (this.previousPendingTokenType == PythonLexer.WS) + { + var prevToken = this.pendingTokens.Last!.Value; + if (this.GetIndentationLength(prevToken.Text) != 0) // there is an "indentation" before the first statement + { + const string errMsg = "first statement indented"; + this.ReportLexerError(errMsg); + // insert an INDENT token before the first statement 
to trigger an 'unexpected indent' error later in the parser + this.CreateAndAddPendingToken(PythonLexer.INDENT, PythonLexerBase.ERR_TXT + errMsg, this.curToken); + } + } + } + + private void HandleNEWLINEtoken() + { + if (this.lexerModeStack.Count > 0) // for multi line f/t-string literals + { + this.AddPendingToken(this.curToken); + return; + } + + if (this.opened > 0) + { + // We're in an implicit line joining, ignore the current NEWLINE token + this.HideAndAddPendingToken(this.curToken); + return; + } + + var nlToken = new CommonToken(this.curToken); // save the current NEWLINE token + var isLookingAhead = this.ffgToken.Type == PythonLexer.WS; + if (isLookingAhead) + { + this.SetCurrentAndFollowingTokens(); // set the next two tokens + } + + switch (this.ffgToken.Type) + { + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment + this.HideAndAddPendingToken(nlToken); + if (isLookingAhead) + { + this.AddPendingToken(this.curToken); // WS token + } + break; + default: + this.AddPendingToken(nlToken); + if (isLookingAhead) + { // We're on a whitespace(s) followed by a statement + var indentationLength = this.ffgToken.Type == TokenConstants.EOF ? 
+ 0 : + this.GetIndentationLength(this.curToken.Text); + + if (indentationLength != PythonLexerBase.INVALID_LENGTH) + { + this.AddPendingToken(this.curToken); // WS token + this.InsertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) + } + else + { + this.ReportError("inconsistent use of tabs and spaces in indentation"); + } + } + else + { + // We're at a newline followed by a statement (there is no whitespace before the statement) + this.InsertIndentOrDedentToken(0); // may insert DEDENT token(s) + } + break; + } + } + + private void InsertIndentOrDedentToken(int indentLength) + { + var prevIndentLength = this.indentLengthStack.Peek(); + if (indentLength > prevIndentLength) + { + this.CreateAndAddPendingToken(PythonLexer.INDENT, null, this.ffgToken); + this.indentLengthStack.Push(indentLength); + return; + } + + while (indentLength < prevIndentLength) + { // more than 1 DEDENT token may be inserted into the token stream + this.indentLengthStack.Pop(); + prevIndentLength = this.indentLengthStack.Peek(); + if (indentLength <= prevIndentLength) + { + this.CreateAndAddPendingToken(PythonLexer.DEDENT, null, this.ffgToken); + } + else + { + this.ReportError("inconsistent dedent"); + } + } + } + + private void CheckCurToken() + { + switch (this.curToken.Type) + { + case PythonLexer.FSTRING_START: + this.curISTRING_MIDDLEtokenType = PythonLexer.FSTRING_MIDDLE; + this.SetLexerModeByISTRING_STARTtoken(); + return; + case PythonLexer.TSTRING_START: + this.curISTRING_MIDDLEtokenType = PythonLexer.TSTRING_MIDDLE; + this.SetLexerModeByISTRING_STARTtoken(); + return; + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + this.HandleISTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field + switch (this.curToken.Type) + { + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + return; // No curToken exchange happened + } + break; + case PythonLexer.FSTRING_END: + case PythonLexer.TSTRING_END: + 
this.PopLexerMode(); + return; + default: + if (this.lexerModeStack.Count == 0) + { + return; // Not in f/t-string mode + } + break; + } + this.ProcessBraceExpression(); + } + + private void ProcessBraceExpression() + { + switch (this.curToken.Type) // the following tokens can only come from default mode (after an LBRACE in f/t-string) + { + case PythonLexer.NEWLINE: + // append the current brace expression with the current newline + this.AppendToBraceExpression(this.curToken.Text); + var nlToken = new CommonToken(this.curToken); + nlToken.Channel = TokenConstants.HiddenChannel; + this.curToken = nlToken; + break; + case PythonLexer.LBRACE: + // the outermost brace expression cannot be a dictionary comprehension or a set comprehension + this.braceExpressionStack.Push("{"); + this.paren_or_bracket_openedStack.Push(0); + this.PushLexerMode(Lexer.DEFAULT_MODE); + break; + case PythonLexer.LPAR: + case PythonLexer.LSQB: + // append the current brace expression with a "(" or a "[" + this.AppendToBraceExpression(this.curToken.Text); + // https://peps.python.org/pep-0498/#lambdas-inside-expressions + this.IncrementBraceStack(); + break; + case PythonLexer.RPAR: + case PythonLexer.RSQB: + // append the current brace expression with a ")" or a "]" + this.AppendToBraceExpression(this.curToken.Text); + this.DecrementBraceStack(); + break; + case PythonLexer.COLON: + case PythonLexer.COLONEQUAL: + // append the current brace expression with a ":" or a ":=" + this.AppendToBraceExpression(this.curToken.Text); + this.SetLexerModeByCOLONorCOLONEQUALtoken(); + break; + case PythonLexer.RBRACE: + this.SetLexerModeAfterRBRACEtoken(); + break; + default: + // append the current brace expression with the current token text + this.AppendToBraceExpression(this.curToken.Text); + break; + } + } + + private void AppendToBraceExpression(string text) + { + var top = this.braceExpressionStack.Pop(); + this.braceExpressionStack.Push(top + text); + } + + private void IncrementBraceStack() + { // 
increment the last element + var top = this.paren_or_bracket_openedStack.Pop(); + this.paren_or_bracket_openedStack.Push(top + 1); + } + + private void DecrementBraceStack() + { // decrement the last element + var top = this.paren_or_bracket_openedStack.Pop(); + this.paren_or_bracket_openedStack.Push(top - 1); + } + + private void SetLexerModeAfterRBRACEtoken() + { + switch (this.curLexerMode) + { + case Lexer.DEFAULT_MODE: + this.PopLexerMode(); + this.PopByBRACE(); + break; + case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.PopLexerMode(); + this.PopLexerMode(); + this.PopByBRACE(); + break; + default: + this.ReportLexerError("f-string: single '}' is not allowed"); + break; + } + } + + private void SetLexerModeByISTRING_STARTtoken() // ISTRING = interpolated string (FSTRING or TSTRING) + { + if (PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.Count == 0) + { + PythonLexerBase.InitLexerModesForIStringStart(); + } + + var interpolatedStringPrefix = this.curToken.Text.ToLower(); + if (PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.TryGetValue(interpolatedStringPrefix, out int 
newLexerMode)) + { + this.PushLexerMode(newLexerMode); + } + else + { + this.ReportLexerError($"internal error: unknown interpolated string literal prefix: {this.curToken.Text}"); + } + } + + private static void InitLexerModesForIStringStart() + { + // f-strings + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["f'"] = PythonLexer.SQ1__FSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rf'"] = PythonLexer.SQ1R_FSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["fr'"] = PythonLexer.SQ1R_FSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["f\""] = PythonLexer.DQ1__FSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rf\""] = PythonLexer.DQ1R_FSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["fr\""] = PythonLexer.DQ1R_FSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["f'''"] = PythonLexer.SQ3__FSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rf'''"] = PythonLexer.SQ3R_FSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["fr'''"] = PythonLexer.SQ3R_FSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["f\"\"\""] = PythonLexer.DQ3__FSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rf\"\"\""] = PythonLexer.DQ3R_FSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["fr\"\"\""] = PythonLexer.DQ3R_FSTRING_MODE; + + // t-strings + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["t'"] = PythonLexer.SQ1__TSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rt'"] = PythonLexer.SQ1R_TSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["tr'"] = PythonLexer.SQ1R_TSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["t\""] = PythonLexer.DQ1__TSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rt\""] = PythonLexer.DQ1R_TSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["tr\""] = PythonLexer.DQ1R_TSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["t'''"] = PythonLexer.SQ3__TSTRING_MODE; + 
PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rt'''"] = PythonLexer.SQ3R_TSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["tr'''"] = PythonLexer.SQ3R_TSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["t\"\"\""] = PythonLexer.DQ3__TSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rt\"\"\""] = PythonLexer.DQ3R_TSTRING_MODE; + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["tr\"\"\""] = PythonLexer.DQ3R_TSTRING_MODE; + } + + private void SetLexerModeByCOLONorCOLONEQUALtoken() + { + // Exit early when the current lexer mode indicates an open parenthesis/bracket + if (this.paren_or_bracket_openedStack.Peek() != 0) + { + return; + } + + // COLONEQUAL token will be replaced with a COLON token in CheckNextToken() + var prevLexerMode = this.lexerModeStack.Peek(); + switch (prevLexerMode) // check the previous lexer mode (the current is DEFAULT_MODE) + { + case PythonLexer.SQ1__FSTRING_MODE: + case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.SQ1__TSTRING_MODE: + case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.SQ1R_FSTRING_MODE: + case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.SQ1R_TSTRING_MODE: + case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.DQ1__FSTRING_MODE: + case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. 
mode + break; + + case PythonLexer.DQ1__TSTRING_MODE: + case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.DQ1R_FSTRING_MODE: + case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.DQ1R_TSTRING_MODE: + case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.SQ3__FSTRING_MODE: + case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.SQ3__TSTRING_MODE: + case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.SQ3R_FSTRING_MODE: + case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.SQ3R_TSTRING_MODE: + case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.DQ3__FSTRING_MODE: + case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.DQ3__TSTRING_MODE: + case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. 
mode + break; + + case PythonLexer.DQ3R_FSTRING_MODE: + case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + + case PythonLexer.DQ3R_TSTRING_MODE: + case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.PushLexerMode(PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + } + } + + private void PopByBRACE() + { + this.paren_or_bracket_openedStack.Pop(); + var curBraceExpression = this.braceExpressionStack.Pop(); + this.prevBraceExpression = curBraceExpression + "}"; + if (this.braceExpressionStack.Count > 0) + { + // Extend the current brace expression by adding the previous expression + curBraceExpression = this.braceExpressionStack.Pop(); + this.braceExpressionStack.Push(curBraceExpression + this.prevBraceExpression); + } + } + + private void HandleISTRING_MIDDLEtokenWithDoubleBrace() // ISTRING = interpolated string (FSTRING or TSTRING) + { + // replace the trailing double brace with a single brace and insert a hidden brace token + var lastTwoChars = this.GetLastTwoCharsOfTheCurTokenText(); + switch (lastTwoChars) + { + case "{{": + this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", TokenConstants.HiddenChannel); + break; + case "}}": + this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", TokenConstants.HiddenChannel); + break; + } + } + + private void HandleISTRING_MIDDLEtokenWithQuoteAndLBrace() // ISTRING = interpolated string (FSTRING or TSTRING) + { + // replace the trailing quote + left_brace with a quote and insert an LBRACE token + // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token + var lastTwoChars = this.GetLastTwoCharsOfTheCurTokenText(); + switch (lastTwoChars) + { + case "\"{": + case "'{": + case "\\{": + this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", TokenConstants.DefaultChannel); 
+ break; + } + } + + private string GetLastTwoCharsOfTheCurTokenText() + { + var text = this.curToken.Text; + return text.Length >= 2 ? text[^2..] : text; + + } + + private void TrimLastCharAddPendingTokenSetCurToken(int type, string text, int channel) + { + // trim the last char and add the modified curToken to the pendingTokens stack + var curTokenText = this.curToken.Text; + var tokenTextWithoutLastChar = curTokenText[..^1]; + var token = new CommonToken(this.curToken); + token.Text = tokenTextWithoutLastChar; + token.StopIndex -= 1; + this.AddPendingToken(token); + + this.CreateNewCurToken(type, text, channel); // set curToken + } + + private void HandleCOLONEQUALtokenInIString() // ISTRING = interpolated string (FSTRING or TSTRING) + { + if (this.lexerModeStack.Count > 0 && + this.paren_or_bracket_openedStack.Peek() == 0) + { + // In an f/t-string, the walrus operator (:=) is only allowed inside parentheses. + // If used outside, split the COLONEQUAL token into a COLON + // (used as a format specifier instead of a walrus operator), + // and move the equal sign to the beginning of the next token (FSTRING_MIDDLE or TSTRING_MIDDLE). 
+ var colonequalToken = new CommonToken(this.curToken); + colonequalToken.Type = PythonLexer.COLON; + colonequalToken.Text = ":"; + colonequalToken.StopIndex = colonequalToken.StartIndex; + this.curToken = colonequalToken; + + switch (this.ffgToken.Type) + { + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + colonequalToken = new CommonToken(this.ffgToken); + colonequalToken.Text = "=" + colonequalToken.Text; + colonequalToken.StartIndex -= 1; + colonequalToken.Column -= 1; + this.ffgToken = colonequalToken; + break; + default: + this.AddPendingToken(this.curToken); + this.CreateNewCurToken(this.curISTRING_MIDDLEtokenType, "=", TokenConstants.DefaultChannel); + break; + } + } + this.AddPendingToken(this.curToken); + } + + private void CreateNewCurToken(int type, string text, int channel) + { + var token = new CommonToken(this.curToken); + token.Type = type; + token.Text = text; + token.Channel = channel; + token.Column += 1; + token.StartIndex += 1; + token.StopIndex = token.StartIndex; + this.curToken = token; + } + + private void PushLexerMode(int mode) + { + this.PushMode(mode); + this.lexerModeStack.Push(this.curLexerMode); + this.curLexerMode = mode; + } + + private void PopLexerMode() + { + this.PopMode(); + this.curLexerMode = this.lexerModeStack.Pop(); + } + + private void HandleFORMAT_SPECIFICATION_MODE() + { + if (this.lexerModeStack.Count == 0 || this.ffgToken.Type != PythonLexer.RBRACE) + { + return; + } + + // insert an empty FSTRING_MIDDLE or TSTRING_MIDDLE token instead of the missing format specification + switch (this.curToken.Type) + { + case PythonLexer.COLON: + this.CreateAndAddPendingToken(this.curISTRING_MIDDLEtokenType, "", this.ffgToken); + break; + case PythonLexer.RBRACE: + // only when the previous brace expression is not a dictionary comprehension or set comprehension + if (!IsValid_DictionaryOrSet_ComprehensionExpression(this.prevBraceExpression)) + { + this.CreateAndAddPendingToken(this.curISTRING_MIDDLEtokenType, 
"", this.ffgToken); + } + break; + } + } + + private static bool IsValid_DictionaryOrSet_ComprehensionExpression(string code) + { + var inputStream = CharStreams.fromString(code); + var lexer = new PythonLexer(inputStream); + var tokenStream = new CommonTokenStream(lexer); + var parser = new PythonParser(tokenStream); + + // Disable error listeners to suppress console output + lexer.RemoveErrorListeners(); + parser.RemoveErrorListeners(); + + parser.dictcomp(); // Try parsing as dictionary comprehension + if (parser.NumberOfSyntaxErrors == 0) + return true; + + parser = new PythonParser(tokenStream); + tokenStream.Seek(0); + parser.RemoveErrorListeners(); + parser.setcomp(); // Try parsing as set comprehension + return parser.NumberOfSyntaxErrors == 0; + } + + private void InsertTrailingTokens() + { + switch (this.lastPendingTokenTypeFromDefaultChannel) + { + case PythonLexer.NEWLINE: + case PythonLexer.DEDENT: + break; // no trailing NEWLINE token is needed + default: + // insert an extra trailing NEWLINE token that serves as the end of the last statement + this.CreateAndAddPendingToken(PythonLexer.NEWLINE, null, this.ffgToken); // ffgToken is EOF + break; + } + this.InsertIndentOrDedentToken(0); // Now insert as many trailing DEDENT tokens as needed + } + + private void HandleEOFtoken() + { + if (this.lastPendingTokenTypeFromDefaultChannel > 0) + { // there was a statement in the intStream (leading NEWLINE tokens are hidden) + this.InsertTrailingTokens(); + } + this.AddPendingToken(this.curToken); + } + + private void HideAndAddPendingToken(IToken originalToken) + { + var token = new CommonToken(originalToken); + token.Channel = TokenConstants.HiddenChannel; + this.AddPendingToken(token); + } + + private void CreateAndAddPendingToken(int ttype, string? 
text, IToken originalToken) + { + var token = new CommonToken(originalToken); + token.Type = ttype; + token.Channel = TokenConstants.DefaultChannel; + token.StopIndex = originalToken.StartIndex - 1; + token.Text = text ?? "<" + this.Vocabulary.GetSymbolicName(ttype) + ">"; + + this.AddPendingToken(token); + } + + private void AddPendingToken(IToken token) + { + // save the last pending token type because the pendingTokens list can be empty by the nextToken() + this.previousPendingTokenType = token.Type; + if (token.Channel == TokenConstants.DefaultChannel) + { + this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; + } + this.pendingTokens.AddLast(token); + } + + private int GetIndentationLength(string indentText) // the indentText may contain spaces, tabs or form feeds + { + var length = 0; + foreach (char ch in indentText) + { + switch (ch) + { + case ' ': + this.wasSpaceIndentation = true; + length += 1; + break; + case '\t': + this.wasTabIndentation = true; + length += PythonLexerBase.TAB_LENGTH - (length % PythonLexerBase.TAB_LENGTH); + break; + case '\f': // form feed + length = 0; + break; + } + } + + if (this.wasTabIndentation && this.wasSpaceIndentation) + { + if (!this.wasIndentationMixedWithSpacesAndTabs) + { + this.wasIndentationMixedWithSpacesAndTabs = true; + length = PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent + } + } + return length; + } + + private void ReportLexerError(string errMsg) + { + this.ErrorListenerDispatch.SyntaxError(this.ErrorOutput, this, this.curToken.Type, this.curToken.Line, this.curToken.Column, " LEXER" + PythonLexerBase.ERR_TXT + errMsg, null); + } + + private void ReportError(string errMsg) + { + this.ReportLexerError(errMsg); + this.CreateAndAddPendingToken(PythonLexer.ERRORTOKEN, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken); + // the ERRORTOKEN also triggers a parser error + } +} diff --git a/python/python3_14/Java/PythonLexerBase.java 
b/python/python3_14/Java/PythonLexerBase.java new file mode 100644 index 0000000000..ef8ccf7c12 --- /dev/null +++ b/python/python3_14/Java/PythonLexerBase.java @@ -0,0 +1,768 @@ +/* +The MIT License (MIT) +Copyright (c) 2021 Robert Einhorn + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + */ + +/* + * + * Project : A helper class for an ANTLR4 Python lexer grammar that assists in tokenizing indentation, + * interpolated strings, and encoding declaration. 
+ * + * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com + * + */ + +// **** Implemented in Java 8 for compatibility with ANTLR4 Java runtime **** + +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.HashMap; +import java.util.Map; + +import org.antlr.v4.runtime.*; +import org.antlr.v4.runtime.misc.Pair; + +public abstract class PythonLexerBase extends Lexer { + private static final Map LEXER_MODES_FOR_ISTRING_START = new HashMap<>(); + + private static final int INVALID_LENGTH = -1; + private static final String ERR_TXT = " ERROR: "; + private static final int TAB_LENGTH = 8; + + private String encodingName; + + // Indentation handling + private Deque indentLengthStack; + private Deque pendingTokens; + + private int previousPendingTokenType; + private int lastPendingTokenTypeFromDefaultChannel; + + // Parenthesis / bracket / brace counts + private int opened; + private Deque parenOrBracketOpenedStack; + private Deque braceExpressionStack; + private String prevBraceExpression; + + // Current interpolated STRING_MIDDLE token type (FSTRING_MIDDLE or TSTRING_MIDDLE) + private int curISTRING_MIDDLEtokenType; + + // We reimplement mode/stack because not all runtimes expose _mode/_modeStack + private int curLexerMode; + private Deque lexerModeStack; + + // Indentation diagnostics + private boolean wasSpaceIndentation; + private boolean wasTabIndentation; + private boolean wasIndentationMixedWithSpacesAndTabs; + + // Current / lookahead tokens + private Token curToken; + private Token ffgToken; + + protected PythonLexerBase(CharStream input) { + super(input); + this.init(); + } + + @Override + public void reset() { + this.init(); + super.reset(); + } + + private void init() { + this.encodingName = ""; + this.indentLengthStack = new ArrayDeque<>(); + this.pendingTokens = new ArrayDeque<>(); + this.previousPendingTokenType = 0; + this.lastPendingTokenTypeFromDefaultChannel = 0; + this.opened = 0; + this.parenOrBracketOpenedStack = new 
ArrayDeque<>(); + this.braceExpressionStack = new ArrayDeque<>(); + this.prevBraceExpression = ""; + this.curISTRING_MIDDLEtokenType = 0; + this.curLexerMode = Lexer.DEFAULT_MODE; + this.lexerModeStack = new ArrayDeque<>(); + this.wasSpaceIndentation = false; + this.wasTabIndentation = false; + this.wasIndentationMixedWithSpacesAndTabs = false; + this.curToken = null; + this.ffgToken = null; + } + + /** + * Sets the encoding name to emit an ENCODING token at the start of the token stream. + * Leave empty if not needed (e.g., when parsing from string). + * + * @param encodingName the encoding name (e.g., "utf-8"), or empty string to disable ENCODING token + */ + public void setEncodingName(final String encodingName) { + this.encodingName = encodingName; + } + + @Override + public Token nextToken() { // Reading the input stream until EOF is reached + this.checkNextToken(); + return this.pendingTokens.pollFirst(); // Add the queued token to the token stream + } + + private void checkNextToken() { + if (this.previousPendingTokenType == Token.EOF) return; + + this.setCurrentAndFollowingTokens(); + if (this.indentLengthStack.isEmpty()) { // We're at the first token + this.handleStartOfInput(); + } + + switch (this.curToken.getType()) { + case PythonLexer.NEWLINE: + this.handleNEWLINEtoken(); + break; + case PythonLexer.LPAR: + case PythonLexer.LSQB: + case PythonLexer.LBRACE: + this.opened++; + this.addPendingToken(this.curToken); + break; + case PythonLexer.RPAR: + case PythonLexer.RSQB: + case PythonLexer.RBRACE: + this.opened--; + this.addPendingToken(this.curToken); + break; + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + this.handleISTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field + this.addPendingToken(this.curToken); + break; + case PythonLexer.COLONEQUAL: + this.handleCOLONEQUALtokenInIString(); + break; + case PythonLexer.ERRORTOKEN: + this.reportLexerError("token recognition error at: '" + this.curToken.getText() 
+ "'"); + this.addPendingToken(this.curToken); + break; + case Token.EOF: + this.handleEOFtoken(); + break; + default: + this.addPendingToken(this.curToken); + } + this.handleFORMAT_SPECIFICATION_MODE(); + } + + private void setCurrentAndFollowingTokens() { + this.curToken = this.ffgToken == null ? + super.nextToken() : + this.ffgToken; + + this.checkCurToken(); // Do not use ffgToken in this method or any of its submethods — it hasn't been set yet! + + this.ffgToken = this.curToken.getType() == Token.EOF ? + this.curToken : + super.nextToken(); + } + + // - initialize indent stack + // - skip BOM token + // - insert ENCODING token (if any) + // - hide leading NEWLINE(s) + // - insert leading INDENT if first statement is indented + private void handleStartOfInput() { + this.indentLengthStack.push(0); // this will never be popped off + + if (this.curToken.getType() == PythonLexer.BOM) { + this.setCurrentAndFollowingTokens(); + } + this.insertENCODINGtoken(); + + while (this.curToken.getType() != Token.EOF) { + if (this.curToken.getChannel() == Token.DEFAULT_CHANNEL) { + if (this.curToken.getType() == PythonLexer.NEWLINE) { + // all the NEWLINE tokens must be ignored before the first statement + this.hideAndAddPendingToken(this.curToken); + } else { // We're at the first statement + this.insertLeadingIndentToken(); + return; // continue the processing of the current token with checkNextToken() + } + } else { + this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token + } + this.setCurrentAndFollowingTokens(); + } + // continue the processing of the EOF token with checkNextToken() + } + + private void insertENCODINGtoken() { // https://peps.python.org/pep-0263/ + if (this.encodingName.isEmpty()) return; + + final Pair sourcePair = this._tokenFactorySourcePair; + final CommonToken encodingToken = new CommonToken(sourcePair, PythonLexer.ENCODING, Token.HIDDEN_CHANNEL, 0, 0); + encodingToken.setText(this.encodingName); + 
encodingToken.setLine(0); + encodingToken.setCharPositionInLine(-1); + this.addPendingToken(encodingToken); + } + + private void insertLeadingIndentToken() { + if (this.previousPendingTokenType == PythonLexer.WS) { + Token prevToken = this.pendingTokens.peekLast(); // WS token + if (this.getIndentationLength(prevToken.getText()) != 0) { // there is an "indentation" before the first statement + final String errMsg = "first statement indented"; + this.reportLexerError(errMsg); + // insert an INDENT token before the first statement to trigger an 'unexpected indent' error later in the parser + this.createAndAddPendingToken(PythonLexer.INDENT, ERR_TXT + errMsg, this.curToken); + } + } + } + + private void handleNEWLINEtoken() { + if (!this.lexerModeStack.isEmpty()) { // for multi line f/t-string literals + this.addPendingToken(this.curToken); + return; + } + + if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token + this.hideAndAddPendingToken(this.curToken); + return; + } + + final Token nlToken = new CommonToken(this.curToken); // save the current NEWLINE token + final boolean isLookingAhead = this.ffgToken.getType() == PythonLexer.WS; + if (isLookingAhead) { + this.setCurrentAndFollowingTokens(); // set the next two tokens + } + + switch (this.ffgToken.getType()) { + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment + this.hideAndAddPendingToken(nlToken); + if (isLookingAhead) { + this.addPendingToken(this.curToken); // WS token + } + break; + default: + this.addPendingToken(nlToken); + if (isLookingAhead) { // We're on a whitespace(s) followed by a statement + final int indentationLength = this.ffgToken.getType() == Token.EOF ? 
+ 0 : + this.getIndentationLength(this.curToken.getText()); + + if (indentationLength != PythonLexerBase.INVALID_LENGTH) { + this.addPendingToken(this.curToken); // WS token + this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) + } else { + this.reportError("inconsistent use of tabs and spaces in indentation"); + } + } else { // We're at a newline followed by a statement (there is no whitespace before the statement) + this.insertIndentOrDedentToken(0); // may insert DEDENT token(s) + } + } + } + + private void insertIndentOrDedentToken(final int indentLength) { + int prevIndentLength = this.indentLengthStack.peek(); + if (indentLength > prevIndentLength) { + this.createAndAddPendingToken(PythonLexer.INDENT, null, this.ffgToken); + this.indentLengthStack.push(indentLength); + return; + } + + while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream + this.indentLengthStack.pop(); + prevIndentLength = this.indentLengthStack.peek(); + if (indentLength <= prevIndentLength) { + this.createAndAddPendingToken(PythonLexer.DEDENT, null, this.ffgToken); + } else { + this.reportError("inconsistent dedent"); + } + } + } + + private void checkCurToken() { + switch (this.curToken.getType()) { + case PythonLexer.FSTRING_START: + this.curISTRING_MIDDLEtokenType = PythonLexer.FSTRING_MIDDLE; + this.setLexerModeByISTRING_STARTtoken(); + return; + case PythonLexer.TSTRING_START: + this.curISTRING_MIDDLEtokenType = PythonLexer.TSTRING_MIDDLE; + this.setLexerModeByISTRING_STARTtoken(); + return; + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + this.handleISTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field + switch (this.curToken.getType()) { + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + return; // No curToken exchange happened + } + break; + case PythonLexer.FSTRING_END: + case PythonLexer.TSTRING_END: + this.popLexerMode(); + 
return; + default: + if (this.lexerModeStack.isEmpty()) { + return; // Not in f/t-string mode + } + } + this.processBraceExpression(); + } + + private void processBraceExpression() { + switch (this.curToken.getType()) { // the following tokens can only come from default mode (after an LBRACE in f/t-string) + case PythonLexer.NEWLINE: + // append the current brace expression with the current newline + this.appendToBraceExpression(this.curToken.getText()); + final CommonToken nlToken = new CommonToken(this.curToken); + nlToken.setChannel(Token.HIDDEN_CHANNEL); + this.curToken = nlToken; + break; + case PythonLexer.LBRACE: + // the outermost brace expression cannot be a dictionary comprehension or a set comprehension + this.braceExpressionStack.push("{"); + this.parenOrBracketOpenedStack.push(0); + this.pushLexerMode(Lexer.DEFAULT_MODE); + break; + case PythonLexer.LPAR: + case PythonLexer.LSQB: + // append the current brace expression with a "(" or a "[" + this.appendToBraceExpression(this.curToken.getText()); + // https://peps.python.org/pep-0498/#lambdas-inside-expressions + this.incrementBraceStack(); + break; + case PythonLexer.RPAR: + case PythonLexer.RSQB: + // append the current brace expression with a ")" or a "]" + this.appendToBraceExpression(this.curToken.getText()); + this.decrementBraceStack(); + break; + case PythonLexer.COLON: + case PythonLexer.COLONEQUAL: + // append the current brace expression with a ":" or a ":=" + this.appendToBraceExpression(this.curToken.getText()); + this.setLexerModeByCOLONorCOLONEQUALtoken(); + break; + case PythonLexer.RBRACE: + this.setLexerModeAfterRBRACEtoken(); + break; + default: + // append the current brace expression with the current token text + this.appendToBraceExpression(this.curToken.getText()); + } + } + + private void appendToBraceExpression(final String text) { + final String top = this.braceExpressionStack.pop(); + this.braceExpressionStack.push(top + text); + } + + private void incrementBraceStack() { // 
increment the last element + this.parenOrBracketOpenedStack.push(this.parenOrBracketOpenedStack.pop() + 1); + } + + private void decrementBraceStack() { // decrement the last element + this.parenOrBracketOpenedStack.push(this.parenOrBracketOpenedStack.pop() - 1); + } + + private void setLexerModeAfterRBRACEtoken() { + switch (this.curLexerMode) { + case Lexer.DEFAULT_MODE: + this.popLexerMode(); + this.popByBRACE(); + break; + case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.popLexerMode(); + this.popLexerMode(); + this.popByBRACE(); + break; + default: + this.reportLexerError("f-string: single '}' is not allowed"); + } + } + + private void setLexerModeByISTRING_STARTtoken() { // ISTRING = interpolated string (FSTRING or TSTRING) + if (PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.isEmpty()) { + PythonLexerBase.initLexerModesForIStringStart(); + } + + final String interpolatedStringPrefix = this.curToken.getText().toLowerCase(); + final Integer newLexerMode = PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.get(interpolatedStringPrefix); + if (newLexerMode != null) { + 
this.pushLexerMode(newLexerMode); + } else { + this.reportLexerError("internal error: unknown interpolated string literal prefix: " + this.curToken.getText()); + } + } + + private static void initLexerModesForIStringStart() { + // f-strings + LEXER_MODES_FOR_ISTRING_START.put("f'", PythonLexer.SQ1__FSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("rf'", PythonLexer.SQ1R_FSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("fr'", PythonLexer.SQ1R_FSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("f\"", PythonLexer.DQ1__FSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("rf\"", PythonLexer.DQ1R_FSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("fr\"", PythonLexer.DQ1R_FSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("f'''", PythonLexer.SQ3__FSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("rf'''", PythonLexer.SQ3R_FSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("fr'''", PythonLexer.SQ3R_FSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("f\"\"\"", PythonLexer.DQ3__FSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("rf\"\"\"", PythonLexer.DQ3R_FSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("fr\"\"\"", PythonLexer.DQ3R_FSTRING_MODE); + + // t-strings + LEXER_MODES_FOR_ISTRING_START.put("t'", PythonLexer.SQ1__TSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("rt'", PythonLexer.SQ1R_TSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("tr'", PythonLexer.SQ1R_TSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("t\"", PythonLexer.DQ1__TSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("rt\"", PythonLexer.DQ1R_TSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("tr\"", PythonLexer.DQ1R_TSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("t'''", PythonLexer.SQ3__TSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("rt'''", PythonLexer.SQ3R_TSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("tr'''", PythonLexer.SQ3R_TSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("t\"\"\"", PythonLexer.DQ3__TSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("rt\"\"\"", 
PythonLexer.DQ3R_TSTRING_MODE); + LEXER_MODES_FOR_ISTRING_START.put("tr\"\"\"", PythonLexer.DQ3R_TSTRING_MODE); + } + + private void setLexerModeByCOLONorCOLONEQUALtoken() { + // Exit early when the current lexer mode indicates an open parenthesis/bracket + if (this.parenOrBracketOpenedStack.peek() != 0) { + return; + } + + // COLONEQUAL token will be replaced with a COLON token in checkNextToken() + final int prevLexerMode = lexerModeStack.peek(); + switch (prevLexerMode) { // check the previous lexer mode (the current is DEFAULT_MODE) + case PythonLexer.SQ1__FSTRING_MODE: + case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.SQ1__TSTRING_MODE: + case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.SQ1R_FSTRING_MODE: + case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.SQ1R_TSTRING_MODE: + case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.DQ1__FSTRING_MODE: + case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.DQ1__TSTRING_MODE: + case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. 
mode + break; + case PythonLexer.DQ1R_FSTRING_MODE: + case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.DQ1R_TSTRING_MODE: + case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.SQ3__FSTRING_MODE: + case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.SQ3__TSTRING_MODE: + case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.SQ3R_FSTRING_MODE: + case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.SQ3R_TSTRING_MODE: + case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.DQ3__FSTRING_MODE: + case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.DQ3__TSTRING_MODE: + case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer.DQ3R_FSTRING_MODE: + case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. 
mode + break; + case PythonLexer.DQ3R_TSTRING_MODE: + case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + } + } + + private void popByBRACE() { + this.parenOrBracketOpenedStack.pop(); + String curBraceExpression = this.braceExpressionStack.pop(); + this.prevBraceExpression = curBraceExpression + "}"; + if (!this.braceExpressionStack.isEmpty()) { + // Extend the current brace expression by adding the previous expression + curBraceExpression = this.braceExpressionStack.pop(); + this.braceExpressionStack.push(curBraceExpression + this.prevBraceExpression); + } + } + + private void handleISTRING_MIDDLEtokenWithDoubleBrace() { // ISTRING = interpolated string (FSTRING or TSTRING) + // replace the trailing double brace with a single brace and insert a hidden brace token + final String lastTwoChars = this.getLastTwoCharsOfTheCurTokenText(); + switch (lastTwoChars) { + case "{{": + this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL); + break; + case "}}": + this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL); + break; + } + } + + private void handleISTRING_MIDDLEtokenWithQuoteAndLBrace() { // ISTRING = interpolated string (FSTRING or TSTRING) + // replace the trailing quote + left_brace with a quote and insert an LBRACE token + // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token + final String lastTwoChars = this.getLastTwoCharsOfTheCurTokenText(); + switch (lastTwoChars) { + case "\"{": + case "'{": + case "\\{": + this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL); + break; + } + } + + private String getLastTwoCharsOfTheCurTokenText() { + final String text = this.curToken.getText(); + return text.length() >= 2 ? 
text.substring(text.length() - 2) : text; + } + + private void trimLastCharAddPendingTokenSetCurToken(final int type, final String text, final int channel) { + // trim the last char and add the modified curToken to the pendingTokens stack + final String curTokenText = this.curToken.getText(); + final String tokenTextWithoutLastChar = curTokenText.substring(0, curTokenText.length() - 1); + final CommonToken token = new CommonToken(this.curToken); + token.setText(tokenTextWithoutLastChar); + token.setStopIndex(token.getStopIndex() - 1); + this.addPendingToken(token); + + this.createNewCurToken(type, text, channel); // set curToken + } + + private void handleCOLONEQUALtokenInIString() { // ISTRING = interpolated string (FSTRING or TSTRING) + if (!this.lexerModeStack.isEmpty() && + this.parenOrBracketOpenedStack.peek() == 0) { + + // In an f/t-string, the walrus operator (:=) is only allowed inside parentheses. + // If used outside, split the COLONEQUAL token into a COLON + // (used as a format specifier instead of a walrus operator), + // and move the equal sign to the beginning of the next token (FSTRING_MIDDLE or TSTRING_MIDDLE). 
+ CommonToken colonequalToken = new CommonToken(this.curToken); + colonequalToken.setType(PythonLexer.COLON); + colonequalToken.setText(":"); + colonequalToken.setStopIndex(colonequalToken.getStartIndex()); + this.curToken = colonequalToken; + + switch (this.ffgToken.getType()) { + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + colonequalToken = new CommonToken(this.ffgToken); + colonequalToken.setText("=" + colonequalToken.getText()); + colonequalToken.setStartIndex(colonequalToken.getStartIndex() - 1); + colonequalToken.setCharPositionInLine(colonequalToken.getCharPositionInLine() - 1); + this.ffgToken = colonequalToken; + break; + default: + this.addPendingToken(this.curToken); + this.createNewCurToken(this.curISTRING_MIDDLEtokenType, "=", Token.DEFAULT_CHANNEL); + } + } + this.addPendingToken(this.curToken); + } + + private void createNewCurToken(final int type, final String text, final int channel) { + final CommonToken token = new CommonToken(this.curToken); + token.setType(type); + token.setText(text); + token.setChannel(channel); + token.setCharPositionInLine(token.getCharPositionInLine() + 1); + token.setStartIndex(token.getStartIndex() + 1); + token.setStopIndex(token.getStartIndex()); + this.curToken = token; + } + + private void pushLexerMode(final int mode) { + this.pushMode(mode); + this.lexerModeStack.push(this.curLexerMode); + this.curLexerMode = mode; + } + + private void popLexerMode() { + this.popMode(); + this.curLexerMode = this.lexerModeStack.pop(); + } + + private void handleFORMAT_SPECIFICATION_MODE() { + if (this.lexerModeStack.isEmpty() || this.ffgToken.getType() != PythonLexer.RBRACE) { + return; + } + + // insert an empty FSTRING_MIDDLE or TSTRING_MIDDLE token instead of the missing format specification + switch (this.curToken.getType()) { + case PythonLexer.COLON: + this.createAndAddPendingToken(this.curISTRING_MIDDLEtokenType, "", this.ffgToken); + break; + case PythonLexer.RBRACE: + // only when the previous 
brace expression is not a dictionary comprehension or set comprehension + if (!isValid_DictionaryOrSet_ComprehensionExpression(this.prevBraceExpression)) { + this.createAndAddPendingToken(this.curISTRING_MIDDLEtokenType, "", this.ffgToken); + } + break; + default: + break; + } + } + + private boolean isValid_DictionaryOrSet_ComprehensionExpression(final String code) { + final CharStream inputStream = CharStreams.fromString(code); + final PythonLexer lexer = new PythonLexer(inputStream); + final CommonTokenStream tokenStream = new CommonTokenStream(lexer); + PythonParser parser = new PythonParser(tokenStream); + + // Disable error listeners to suppress console output + lexer.removeErrorListeners(); + parser.removeErrorListeners(); + + parser.dictcomp(); // Try parsing as dictionary comprehension + if (parser.getNumberOfSyntaxErrors() == 0) + return true; + + parser = new PythonParser(tokenStream); + tokenStream.seek(0); + parser.removeErrorListeners(); + parser.setcomp(); // Try parsing as set comprehension + return parser.getNumberOfSyntaxErrors() == 0; + } + + private void insertTrailingTokens() { + switch (this.lastPendingTokenTypeFromDefaultChannel) { + case PythonLexer.NEWLINE: + case PythonLexer.DEDENT: + break; // no trailing NEWLINE token is needed + default: // insert an extra trailing NEWLINE token that serves as the end of the last statement + this.createAndAddPendingToken(PythonLexer.NEWLINE, null, this.ffgToken); // ffgToken is EOF + } + this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed + } + + private void handleEOFtoken() { + if (this.lastPendingTokenTypeFromDefaultChannel > 0) { + // there was a statement in the input (leading NEWLINE tokens are hidden) + this.insertTrailingTokens(); + } + this.addPendingToken(this.curToken); + } + + private void hideAndAddPendingToken(final Token originalToken) { + final CommonToken token = new CommonToken(originalToken); + token.setChannel(Token.HIDDEN_CHANNEL); + 
this.addPendingToken(token); + } + + private void createAndAddPendingToken(final int tokenType, final String text, final Token originalToken) { + final CommonToken token = new CommonToken(originalToken); + token.setType(tokenType); + token.setChannel(Token.DEFAULT_CHANNEL); + token.setStopIndex(originalToken.getStartIndex() - 1); + token.setText(text == null ? + "<" + this.getVocabulary().getSymbolicName(tokenType) + ">" : + text); + + this.addPendingToken(token); + } + + private void addPendingToken(final Token token) { + // save the last pending token type because the pendingTokens list can be empty by the nextToken() + this.previousPendingTokenType = token.getType(); + if (token.getChannel() == Token.DEFAULT_CHANNEL) { + this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; + } + this.pendingTokens.addLast(token); + } + + private int getIndentationLength(final String indentText) { // the indentText may contain spaces, tabs or form feeds + int length = 0; + for (char ch : indentText.toCharArray()) { + switch (ch) { + case ' ': + this.wasSpaceIndentation = true; + length += 1; + break; + case '\t': + this.wasTabIndentation = true; + length += PythonLexerBase.TAB_LENGTH - (length % PythonLexerBase.TAB_LENGTH); + break; + case '\f': // form feed + length = 0; + break; + } + } + + if (this.wasTabIndentation && this.wasSpaceIndentation) { + if (!(this.wasIndentationMixedWithSpacesAndTabs)) { + this.wasIndentationMixedWithSpacesAndTabs = true; + length = PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent + } + } + return length; + } + + private void reportLexerError(final String errMsg) { + this.getErrorListenerDispatch().syntaxError(this, this.curToken.getType(), this.curToken.getLine(), this.curToken.getCharPositionInLine(), " LEXER" + ERR_TXT + errMsg, null); + } + + private void reportError(final String errMsg) { + this.reportLexerError(errMsg); + this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, ERR_TXT + errMsg, 
this.ffgToken); + // the ERRORTOKEN also triggers a parser error + } +} diff --git a/python/python3_14/JavaScript/PythonLexerBase.js b/python/python3_14/JavaScript/PythonLexerBase.js new file mode 100644 index 0000000000..1aed61866d --- /dev/null +++ b/python/python3_14/JavaScript/PythonLexerBase.js @@ -0,0 +1,778 @@ +/* +The MIT License (MIT) +Copyright (c) 2021 Robert Einhorn + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + */ + +/* + * + * Project : A helper class for an ANTLR4 Python lexer grammar that assists in tokenizing indentation, + * interpolated strings, and encoding declaration. 
+ * + * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com + * + */ + +import { CharStreams, CommonTokenStream, Token, CommonToken, Lexer } from "antlr4"; +import PythonLexer from "./PythonLexer.js"; +import PythonParser from "./PythonParser.js"; + +export default class PythonLexerBase extends Lexer { + static #LEXER_MODES_FOR_ISTRING_START = new Map(); + static #INVALID_LENGTH = -1; + static #ERR_TXT = " ERROR: "; + static #TAB_LENGTH = 8; + + #encodingName; + + // Indentation handling + #indentLengthStack; + #pendingTokens; + + #previousPendingTokenType; + #lastPendingTokenTypeFromDefaultChannel; + + // Parenthesis / bracket / brace counts + #opened; + #paren_or_bracket_openedStack; + #braceExpressionStack; + #prevBraceExpression; + + // Current interpolated STRING_MIDDLE token type (FSTRING_MIDDLE or TSTRING_MIDDLE) + #curISTRING_MIDDLEtokenType;; + + // We reimplement mode/stack because not all runtimes expose _mode/_modeStack + #curLexerMode; + #lexerModeStack; + + // Indentation diagnostics + #wasSpaceIndentation; + #wasTabIndentation; + #wasIndentationMixedWithSpacesAndTabs; + + // Current / lookahead tokens + #curToken; + #ffgToken; + + constructor(input) { + super(input); + this.#init(); + } + + reset() { + this.#init(); + super.reset(); + } + + #init() { + this.#encodingName = ""; + this.#indentLengthStack = []; + this.#pendingTokens = []; + this.#previousPendingTokenType = 0; + this.#lastPendingTokenTypeFromDefaultChannel = 0; + this.#opened = 0; + this.#paren_or_bracket_openedStack = []; + this.#braceExpressionStack = []; + this.#prevBraceExpression = ""; + this.#curISTRING_MIDDLEtokenType = 0; + this.#curLexerMode = Lexer.DEFAULT_MODE; + this.#lexerModeStack = []; + this.#wasSpaceIndentation = false; + this.#wasTabIndentation = false; + this.#wasIndentationMixedWithSpacesAndTabs = false; + this.#curToken = null; + this.#ffgToken = null; + } + + /** + * Sets the encoding name to emit an ENCODING token at the start of the token stream. 
+     * Leave empty if not needed (e.g., when parsing from string).
+     *
+     * @param {string} encodingName - The encoding name (e.g., "utf-8"), or empty string to disable ENCODING token.
+     */
+    setEncodingName(encodingName) {
+        this.#encodingName = encodingName;
+    }
+
+    nextToken() { // Reading the input stream until EOF is reached
+        this.#checkNextToken();
+        return this.#pendingTokens.shift() /* stack pollFirst() */; // Add the queued token to the token stream
+    }
+
+    #checkNextToken() {
+        if (this.#previousPendingTokenType === Token.EOF) {
+            return;
+        }
+
+        this.#setCurrentAndFollowingTokens();
+        if (this.#indentLengthStack.length === 0) { // We're at the first token
+            this.#handleStartOfInput();
+        }
+
+        switch (this.#curToken.type) {
+            case PythonLexer.NEWLINE:
+                this.#handleNEWLINEtoken();
+                break;
+            case PythonLexer.LPAR:
+            case PythonLexer.LSQB:
+            case PythonLexer.LBRACE:
+                this.#opened++;
+                this.#addPendingToken(this.#curToken);
+                break;
+            case PythonLexer.RPAR:
+            case PythonLexer.RSQB:
+            case PythonLexer.RBRACE:
+                this.#opened--;
+                this.#addPendingToken(this.#curToken);
+                break;
+            case PythonLexer.FSTRING_MIDDLE:
+            case PythonLexer.TSTRING_MIDDLE:
+                this.#handleFSTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field
+                this.#addPendingToken(this.#curToken);
+                break;
+            case PythonLexer.COLONEQUAL:
+                this.#handleCOLONEQUALtokenInIString();
+                break;
+            case PythonLexer.ERRORTOKEN:
+                this.#reportLexerError(`token recognition error at: '${this.#curToken.text}'`);
+                this.#addPendingToken(this.#curToken);
+                break;
+            case Token.EOF:
+                this.#handleEOFtoken();
+                break;
+            default:
+                this.#addPendingToken(this.#curToken);
+        }
+        this.#handleFORMAT_SPECIFICATION_MODE();
+    }
+
+    #setCurrentAndFollowingTokens() {
+        this.#curToken = this.#ffgToken == undefined ?
+            super.nextToken() :
+            this.#ffgToken;
+
+        this.#checkCurToken(); // Do not use ffgToken in this method or any of its submethods — it hasn't been set yet!
+ + this.#ffgToken = this.#curToken.type === Token.EOF ? + this.#curToken : + super.nextToken(); + } + + // - initialize indent stack + // - skip BOM token + // - insert ENCODING token (if any) + // - hide leading NEWLINE(s) + // - insert leading INDENT if first statement is indented + #handleStartOfInput() { + // initialize the stack with a default 0 indentation length + this.#indentLengthStack.push(0); // this will never be popped off + + if (this.#curToken.type === PythonLexer.BOM) { + this.#setCurrentAndFollowingTokens(); + } + + this.#insertENCODINGtoken(); + + while (this.#curToken.type !== Token.EOF) { + if (this.#curToken.channel === Token.DEFAULT_CHANNEL) { + if (this.#curToken.type === PythonLexer.NEWLINE) { + // all the NEWLINE tokens must be ignored before the first statement + this.#hideAndAddPendingToken(this.#curToken); + } else { // We're at the first statement + this.#insertLeadingIndentToken(); + return; // continue the processing of the current token with #checkNextToken() + } + } else { + this.#addPendingToken(this.#curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token + } + this.#setCurrentAndFollowingTokens(); + } // continue the processing of the EOF token with #checkNextToken() + } + + #insertENCODINGtoken() { + if (this.#encodingName === "") return + + const sourcePair = [this, this._input]; + const encodingToken = new CommonToken(sourcePair, PythonLexer.ENCODING, Token.HIDDEN_CHANNEL, /*start*/ 0, /*stop*/ 0); + encodingToken.text = this.#encodingName; + encodingToken.line = 0; + encodingToken.column = -1; + this.#addPendingToken(encodingToken); + } + + #insertLeadingIndentToken() { + if (this.#previousPendingTokenType === PythonLexer.WS) { + const prevToken = this.#pendingTokens.at(-1); /* stack peek */ // WS token + if (this.#getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement + const errMsg = "first statement indented"; + this.#reportLexerError(errMsg); + // insert an INDENT 
token before the first statement to trigger an 'unexpected indent' error later in the parser + this.#createAndAddPendingToken(PythonLexer.INDENT, PythonLexerBase.#ERR_TXT + errMsg, this.#curToken); + } + } + } + + #handleNEWLINEtoken() { + if (this.#lexerModeStack.length > 0) { // for multi line f/t-string literals + this.#addPendingToken(this.#curToken); + return; + } + + if (this.#opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token + this.#hideAndAddPendingToken(this.#curToken); + return; + } + + const nlToken = this.#curToken.clone(); // save the current NEWLINE token + const isLookingAhead = this.#ffgToken.type === PythonLexer.WS; + if (isLookingAhead) { + this.#setCurrentAndFollowingTokens(); // set the next two tokens + } + + switch (this.#ffgToken.type) { + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment + this.#hideAndAddPendingToken(nlToken); + if (isLookingAhead) { + this.#addPendingToken(this.#curToken); // WS token + } + break; + default: + this.#addPendingToken(nlToken); + if (isLookingAhead) { // We're on a whitespace(s) followed by a statement + const indentationLength = this.#ffgToken.type === Token.EOF ? 
+ 0 : + this.#getIndentationLength(this.#curToken.text); + + if (indentationLength !== PythonLexerBase.#INVALID_LENGTH) { + this.#addPendingToken(this.#curToken); // WS token + this.#insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) + } else { + this.#reportError("inconsistent use of tabs and spaces in indentation"); + } + } else { // We're at a newline followed by a statement (there is no whitespace before the statement) + this.#insertIndentOrDedentToken(0); // may insert DEDENT token(s) + } + } + } + + #insertIndentOrDedentToken(curIndentLength) { + let prevIndentLength = this.#indentLengthStack.at(-1) /* stack peek */; + if (curIndentLength > prevIndentLength) { + this.#createAndAddPendingToken(PythonLexer.INDENT, null, this.#ffgToken); + this.#indentLengthStack.push(curIndentLength); + return; + } + + while (curIndentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream + this.#indentLengthStack.pop(); + prevIndentLength = this.#indentLengthStack.at(-1) /* stack peek */; + if (curIndentLength <= prevIndentLength) { + this.#createAndAddPendingToken(PythonLexer.DEDENT, null, this.#ffgToken); + } else { + this.#reportError("inconsistent dedent"); + } + } + } + + #checkCurToken() { + switch (this.#curToken.type) { + case PythonLexer.FSTRING_START: + this.#curISTRING_MIDDLEtokenType = PythonLexer.FSTRING_MIDDLE; + this.#setLexerModeByISTRING_STARTtoken(); + return; + case PythonLexer.TSTRING_START: + this.#curISTRING_MIDDLEtokenType = PythonLexer.TSTRING_MIDDLE; + this.#setLexerModeByISTRING_STARTtoken(); + return; + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + this.#handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field + switch (this.#curToken.type) { + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + return; // No curToken exchange happened + } + break; + case PythonLexer.FSTRING_END: + case PythonLexer.TSTRING_END: + 
this.#popLexerMode(); + return; + default: + if (this.#lexerModeStack.length === 0) { + return; // Not in fstring mode + } + + } + this.#processBraceExpression(); + } + + #processBraceExpression() { + switch (this.#curToken.type) { // the following tokens can only come from default mode (after an LBRACE in f/t-string) + case PythonLexer.NEWLINE: + // append the current brace expression with the current newline + this.#appendToBraceExpression(this.#curToken.text) + this.#curToken.channel = Token.HIDDEN_CHANNEL; + break; + case PythonLexer.LBRACE: + // the outermost brace expression cannot be a dictionary comprehension or a set comprehension + this.#braceExpressionStack.push("{"); + this.#paren_or_bracket_openedStack.push(0); + this.#pushLexerMode(Lexer.DEFAULT_MODE); + break; + case PythonLexer.LPAR: + case PythonLexer.LSQB: + // append the current brace expression with a "(" or a "[" + this.#appendToBraceExpression(this.#curToken.text) + // https://peps.python.org/pep-0498/#lambdas-inside-expressions + this.#incrementBraceStack(); + break; + case PythonLexer.RPAR: + case PythonLexer.RSQB: + // append the current brace expression with a ")" or a "]" + this.#appendToBraceExpression(this.#curToken.text) + this.#decrementBraceStack(); + break; + case PythonLexer.COLON: + case PythonLexer.COLONEQUAL: + // append the current brace expression with a ":" or a ":=" + this.#appendToBraceExpression(this.#curToken.text) + this.#setLexerModeByCOLONorCOLONEQUALtoken(); + break; + case PythonLexer.RBRACE: + this.#setLexerModeAfterRBRACEtoken(); + break; + default: + // append the current brace expression with the current token text + this.#appendToBraceExpression(this.#curToken.text) + } + } + + #appendToBraceExpression(text) { + const lastIndex = this.#braceExpressionStack.length - 1; + this.#braceExpressionStack[lastIndex] += text; + } + + #incrementBraceStack() { // increment the last element (stack peek + 1) + const lastIndex = this.#paren_or_bracket_openedStack.length - 1; + 
this.#paren_or_bracket_openedStack[lastIndex]++; + } + + #decrementBraceStack() { // decrement the last element (stack peek - 1) + const lastIndex = this.#paren_or_bracket_openedStack.length - 1; + this.#paren_or_bracket_openedStack[lastIndex]--; + } + + #setLexerModeAfterRBRACEtoken() { + switch (this.#curLexerMode) { + case Lexer.DEFAULT_MODE: + this.#popLexerMode(); + this.#popByBRACE(); + break; + case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.#popLexerMode(); + this.#popLexerMode(); + this.#popByBRACE(); + break; + default: + this.#reportLexerError("f-string: single '}' is not allowed"); + } + } + + #setLexerModeByISTRING_STARTtoken() { // ISTRING = interpolated string (FSTRING or TSTRING) + if (PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.size === 0) { + PythonLexerBase.#initLexerModesForIStringStart(); + } + + const interpolatedStringPrefix = this.#curToken.text.toLowerCase(); + if (PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.has(interpolatedStringPrefix)) { + const newLexerMode = PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.get(interpolatedStringPrefix); + 
this.#pushLexerMode(newLexerMode); + } else { + this.#reportLexerError( + "internal error: unknown interpolated string literal prefix: " + this.#curToken.text + ); + } + } + + static #initLexerModesForIStringStart() { + // f-strings + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("f'", PythonLexer.SQ1__FSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rf'", PythonLexer.SQ1R_FSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("fr'", PythonLexer.SQ1R_FSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("f\"", PythonLexer.DQ1__FSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rf\"", PythonLexer.DQ1R_FSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("fr\"", PythonLexer.DQ1R_FSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("f'''", PythonLexer.SQ3__FSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rf'''", PythonLexer.SQ3R_FSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("fr'''", PythonLexer.SQ3R_FSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("f\"\"\"", PythonLexer.DQ3__FSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rf\"\"\"", PythonLexer.DQ3R_FSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("fr\"\"\"", PythonLexer.DQ3R_FSTRING_MODE); + + // t-strings + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("t'", PythonLexer.SQ1__TSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rt'", PythonLexer.SQ1R_TSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("tr'", PythonLexer.SQ1R_TSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("t\"", PythonLexer.DQ1__TSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rt\"", PythonLexer.DQ1R_TSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("tr\"", PythonLexer.DQ1R_TSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("t'''", 
PythonLexer.SQ3__TSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rt'''", PythonLexer.SQ3R_TSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("tr'''", PythonLexer.SQ3R_TSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("t\"\"\"", PythonLexer.DQ3__TSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rt\"\"\"", PythonLexer.DQ3R_TSTRING_MODE); + PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("tr\"\"\"", PythonLexer.DQ3R_TSTRING_MODE); + } + + #setLexerModeByCOLONorCOLONEQUALtoken() { + // Exit early when the current lexer mode indicates an open parenthesis/bracket + const opened = this.#paren_or_bracket_openedStack.at(-1) > 0; /* stack peek */ + if (opened) { + return; + } + + // COLONEQUAL token will be replaced with a COLON token in CheckNextToken() + const prevLexerMode = this.#lexerModeStack.at(-1); /* stack peek */ + switch (prevLexerMode) { + case PythonLexer.SQ1__FSTRING_MODE: + case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ1__TSTRING_MODE: + case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ1R_FSTRING_MODE: + case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ1R_TSTRING_MODE: + case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ1__FSTRING_MODE: + case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ1__TSTRING_MODE: + case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + 
this.#pushLexerMode(PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ1R_FSTRING_MODE: + case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ1R_TSTRING_MODE: + case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ3__FSTRING_MODE: + case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ3__TSTRING_MODE: + case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ3R_FSTRING_MODE: + case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ3R_TSTRING_MODE: + case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ3__FSTRING_MODE: + case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ3__TSTRING_MODE: + case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ3R_FSTRING_MODE: + case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ3R_TSTRING_MODE: + case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.#pushLexerMode(PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); + break; + } + } + + #popByBRACE() { + this.#paren_or_bracket_openedStack.pop(); + const 
curBraceExpression = this.#braceExpressionStack.pop(); + this.#prevBraceExpression = curBraceExpression + "}"; + if (this.#braceExpressionStack.length > 0) { + // Extend the current brace expression by adding the previous expression + const lastIndex = this.#braceExpressionStack.length - 1; + this.#braceExpressionStack[lastIndex] += this.#prevBraceExpression; + } + } + + #handleFSTRING_MIDDLEtokenWithDoubleBrace() { // ISTRING = interpolated string (FSTRING or TSTRING) + // replace the trailing double brace with a single brace and insert a hidden brace token + const lastTwoChars = this.#getLastTwoCharsOfTheCurTokenText(); + switch (lastTwoChars) { + case "{{": + this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL); + break; + case "}}": + this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL); + break; + } + } + + #handleFSTRING_MIDDLEtokenWithQuoteAndLBrace() { // ISTRING = interpolated string (FSTRING or TSTRING) + // replace the trailing quote + left_brace with a quote and insert an LBRACE token + // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token + const lastTwoChars = this.#getLastTwoCharsOfTheCurTokenText(); + switch (lastTwoChars) { + case "\"{": + case "'{": + case "\\{": + this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL); + break; + } + } + + #getLastTwoCharsOfTheCurTokenText() { + const text = this.#curToken.text; + return text.length <= 2 ? 
text : text.slice(-2); + } + + #trimLastCharAddPendingTokenSetCurToken(type, text, channel) { + // trim the last char and add the modified curToken to the pendingTokens stack + const tokenTextWithoutLastChar = this.#curToken.text.slice(0, -1); + this.#curToken.text = tokenTextWithoutLastChar; + this.#curToken.stop -= 1; + this.#addPendingToken(this.#curToken); + + this.#createNewCurToken(type, text, channel); // set curToken + } + + #handleCOLONEQUALtokenInIString() { // ISTRING = interpolated string (FSTRING or TSTRING) + if (this.#lexerModeStack.length > 0 && + this.#paren_or_bracket_openedStack.at(-1) === 0) { // stack peek === 0 + + // In an f/t-string, the walrus operator (:=) is only allowed inside parentheses. + // If used outside, split the COLONEQUAL token into a COLON + // (used as a format specifier instead of a walrus operator), + // and move the equal sign to the beginning of the next token (FSTRING_MIDDLE or TSTRING_MIDDLE). + this.#curToken.type = PythonLexer.COLON; + this.#curToken.text = ":"; + this.#curToken.stop = this.#curToken.start; + + switch (this.#ffgToken.type) { + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: { + const token = this.#ffgToken.clone(); + token.text = "=" + token.text; + token.start -= 1; + token.column -= 1; + this.#ffgToken = token; + break; + } + default: { + this.#addPendingToken(this.#curToken); + this.#createNewCurToken(this.#curISTRING_MIDDLEtokenType, "=", Token.DEFAULT_CHANNEL); + } + } + } + this.#addPendingToken(this.#curToken); + } + + #createNewCurToken(type, text, channel) { + const token = this.#curToken.clone(); + token.type = type; + token.text = text; + token.channel = channel; + token.column += 1; + token.start += 1; + token.stop = token.start; + this.#curToken = token; + } + + #pushLexerMode(mode) { + this.pushMode(mode); + this.#lexerModeStack.push(this.#curLexerMode); + this.#curLexerMode = mode; + } + + #popLexerMode() { + this.popMode(); + this.#curLexerMode = 
this.#lexerModeStack.pop(); + } + + #handleFORMAT_SPECIFICATION_MODE() { + if (this.#lexerModeStack.length == 0 || this.#ffgToken.type !== PythonLexer.RBRACE) { + return; + } + + // insert an empty FSTRING_MIDDLE or TSTRING_MIDDLE token instead of the missing format specification + switch (this.#curToken.type) { + case PythonLexer.COLON: + this.#createAndAddPendingToken(this.#curISTRING_MIDDLEtokenType, "", this.#ffgToken); + break; + case PythonLexer.RBRACE: + // only when the previous brace expression is not a dictionary comprehension or set comprehension + if (!this.#isValid_DictionaryOrSet_ComprehensionExpression(this.#prevBraceExpression)) { + this.#createAndAddPendingToken(this.#curISTRING_MIDDLEtokenType, "", this.#ffgToken); + } + break; + } + } + + #isValid_DictionaryOrSet_ComprehensionExpression(code) { + const inputStream = CharStreams.fromString(code); + const lexer = new PythonLexer(inputStream); + const tokenStream = new CommonTokenStream(lexer); + let parser = new PythonParser(tokenStream); + + // Disable error listeners to suppress console output + lexer.removeErrorListeners(); + parser.removeErrorListeners(); + + parser.dictcomp(); // Try parsing as dictionary comprehension + if (parser.syntaxErrorsCount === 0) + return true; + + parser = new PythonParser(tokenStream); + tokenStream.seek(0); + parser.removeErrorListeners(); + parser.setcomp(); // Try parsing as set comprehension + return parser.syntaxErrorsCount === 0; + } + + #insertTrailingTokens() { + switch (this.#lastPendingTokenTypeFromDefaultChannel) { + case PythonLexer.NEWLINE: + case PythonLexer.DEDENT: + break; // no trailing NEWLINE token is needed + default: + // insert an extra trailing NEWLINE token that serves as the end of the last statement + this.#createAndAddPendingToken(PythonLexer.NEWLINE, null, this.#ffgToken); // ffgToken is EOF + } + this.#insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed + } + + #handleEOFtoken() { + if 
(this.#lastPendingTokenTypeFromDefaultChannel > 0) { + // there was a statement in the input (leading NEWLINE tokens are hidden) + this.#insertTrailingTokens(); + } + this.#addPendingToken(this.#curToken); + } + + #hideAndAddPendingToken(originalToken) { + originalToken.channel = Token.HIDDEN_CHANNEL; + this.#addPendingToken(originalToken); + } + + #createAndAddPendingToken(type, text, originalToken) { + const token = originalToken.clone(); + token.type = type; + token.channel = Token.DEFAULT_CHANNEL; + token.stop = originalToken.start - 1; + token.text = text == null ? + `<${PythonLexer.symbolicNames[type] ?? ""}>` : + text; + + this.#addPendingToken(token); + } + + #addPendingToken(token) { + // save the last pending token type because the pendingTokens linked list can be empty by the nextToken() + this.#previousPendingTokenType = token.type; + if (token.channel === Token.DEFAULT_CHANNEL) { + this.#lastPendingTokenTypeFromDefaultChannel = this.#previousPendingTokenType; + } + this.#pendingTokens.push(token) /* .addLast(token) */; + } + + #getIndentationLength(indentText) { // the indentText may contain spaces, tabs or form feeds + let length = 0; + for (let ch of indentText) { + switch (ch) { + case " ": + this.#wasSpaceIndentation = true; + length += 1; + break; + case "\t": + this.#wasTabIndentation = true; + length += PythonLexerBase.#TAB_LENGTH - (length % PythonLexerBase.#TAB_LENGTH); + break; + case "\f": // form feed + length = 0; + break; + } + } + + if (this.#wasTabIndentation && this.#wasSpaceIndentation) { + if (!this.#wasIndentationMixedWithSpacesAndTabs) { + this.#wasIndentationMixedWithSpacesAndTabs = true; + length = PythonLexerBase.#INVALID_LENGTH; // only for the first inconsistent indent + } + } + return length; + } + + #reportLexerError(errMsg) { + this.getErrorListener().syntaxError(this, this.#curToken.type, this.#curToken.line, this.#curToken.column, " LEXER" + PythonLexerBase.#ERR_TXT + errMsg, null); + } + + #reportError(errMsg) { + 
this.#reportLexerError(errMsg); + + this.#createAndAddPendingToken(PythonLexer.ERRORTOKEN, PythonLexerBase.#ERR_TXT + errMsg, this.#ffgToken); + // the ERRORTOKEN also triggers a parser error + } +} diff --git a/python/python3_14/Python3/PythonLexerBase.py b/python/python3_14/Python3/PythonLexerBase.py new file mode 100644 index 0000000000..c4d4026974 --- /dev/null +++ b/python/python3_14/Python3/PythonLexerBase.py @@ -0,0 +1,595 @@ +# The MIT License (MIT) +# Copyright (c) 2021 Robert Einhorn +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +# Project : A helper class for an ANTLR4 Python lexer grammar that assists in tokenizing indentation, +# interpolated strings, and encoding declaration. 
+# +# Developed by : Robert Einhorn + +from collections import deque +from typing import Literal, TextIO, Optional +from antlr4 import InputStream, Lexer, Token +from antlr4.Token import CommonToken +import PythonLexer +import sys + +INVALID_LENGTH: Literal[-1] = -1 +ERR_TXT: Literal[" ERROR: "] = " ERROR: " +TAB_LENGTH: Literal[8] = 8 + +class PythonLexerBase(Lexer): + _LEXER_MODES_FOR_ISTRING_START: dict[str, int] = {} # static field + + def __init__(self, input: InputStream, output: TextIO = sys.stdout): + super().__init__(input, output) + self._init() + + def reset(self) -> None: + self._init() + super().reset() + + def _init(self) -> None: + self._encodingName: str = "" + + # Indentation handling + self._indent_length_stack: list[int] = [] + self._pending_tokens: deque[CommonToken] = deque() + + self._previous_pending_token_type: int = 0 + self._last_pending_token_type_from_default_channel = 0 + + # Parenthesis / bracket / brace counts + self._opened: int = 0 + self._paren_or_bracket_opened_stack: list[int] = [] + self._brace_expression_stack: list[str] = [] + self._prev_brace_expression: str = "" + + # Current interpolated STRING_MIDDLE token type (FSTRING_MIDDLE or TSTRING_MIDDLE) + self._cur_ISTRING_MIDDLE_token_type: int = 0 + + # We reimplement mode/stack because not all runtimes expose _mode/_modeStack + self._cur_lexer_mode: int = Lexer.DEFAULT_MODE + self._lexer_mode_stack: list[int] = [] + + # Indentation diagnostics + self._was_space_indentation: bool = False + self._was_tab_indentation: bool = False + self._was_indentation_mixed_with_spaces_and_tabs: bool = False + + # Current / lookahead tokens + self._cur_token: CommonToken = None + self._ffg_token: CommonToken = None + + def set_encoding_name(self, encoding_name: str) -> None: + """ + Sets the encoding name to emit an ENCODING token at the start of the token stream. + Leave empty if not needed (e.g., when parsing from string). 
+
+        :param encoding_name: The encoding name (e.g., "utf-8"), or empty string to disable ENCODING token.
+        """
+        self._encodingName = encoding_name
+
+    def nextToken(self) -> CommonToken: # Reading the input stream until EOF is reached
+        self._check_next_token()
+        return self._pending_tokens.popleft() # Add the queued token to the token stream
+
+    def _check_next_token(self) -> None:
+        if self._previous_pending_token_type == Token.EOF:
+            return
+
+        self._set_current_and_following_tokens()
+        if not self._indent_length_stack: # We're at the first token
+            self._handle_start_of_input()
+
+        match self._cur_token.type:
+            case self.NEWLINE:
+                self._handle_NEWLINE_token()
+            case self.LPAR | self.LSQB | self.LBRACE:
+                self._opened += 1
+                self._add_pending_token(self._cur_token)
+            case self.RPAR | self.RSQB | self.RBRACE:
+                self._opened -= 1
+                self._add_pending_token(self._cur_token)
+            case self.FSTRING_MIDDLE | self.TSTRING_MIDDLE:
+                self._handle_ISTRING_MIDDLE_token_with_double_brace() # does not affect the opened field
+                self._add_pending_token(self._cur_token)
+            case self.COLONEQUAL:
+                self._handle_COLONEQUAL_token_in_istring()
+            case self.ERRORTOKEN:
+                self._report_lexer_error("token recognition error at: '" + self._cur_token.text + "'")
+                self._add_pending_token(self._cur_token)
+            case Token.EOF:
+                self._handle_EOF_token()
+            case _:
+                self._add_pending_token(self._cur_token)
+        self._handle_FORMAT_SPECIFICATION_MODE()
+
+    def _set_current_and_following_tokens(self) -> None:
+        self._cur_token = super().nextToken() if self._ffg_token is None else \
+            self._ffg_token
+
+        self._check_cur_token() # Do not use ffgToken in this method or any of its submethods — it hasn't been set yet!
+ + self._ffg_token = self._cur_token if self._cur_token.type == Token.EOF else \ + super().nextToken() + + # - initialize indent stack + # - skip BOM token + # - insert ENCODING token (if any) + # - hide leading NEWLINE(s) + # - insert leading INDENT if first statement is indented + def _handle_start_of_input(self) -> None: + # initialize the stack with a default 0 indentation length + self._indent_length_stack.append(0) # this will never be popped off + + if self._cur_token.type == self.BOM: + self._set_current_and_following_tokens() + self._insert_ENCODING_token() + + while self._cur_token.type != Token.EOF: + if self._cur_token.channel == Token.DEFAULT_CHANNEL: + if self._cur_token.type == self.NEWLINE: + # all the NEWLINE tokens must be ignored before the first statement + self._hide_and_add_pending_token(self._cur_token) + else: # We're at the first statement + self._insert_leading_indent_token() + return # continue the processing of the current token with _check_next_token() + else: + self._add_pending_token(self._cur_token) # it can be WS, EXPLICIT_LINE_JOINING or COMMENT token + self._set_current_and_following_tokens() + # continue the processing of the EOF token with _check_next_token() + + def _insert_ENCODING_token(self) -> None: # https://peps.python.org/pep-0263/ + if not self._encodingName: + return + + source_pair = self._tokenFactorySourcePair + encoding_token: CommonToken = CommonToken(source_pair, self.ENCODING, Token.HIDDEN_CHANNEL, start = 0, stop = 0) + encoding_token.text = self._encodingName + encoding_token.line = 0 + encoding_token.column = -1 + self._add_pending_token(encoding_token) + + def _insert_leading_indent_token(self) -> None: + if self._previous_pending_token_type == self.WS: + prev_token: CommonToken = self._pending_tokens[-1] # stack peek, WS token + if self._get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement + err_msg: str = "first statement indented" + 
self._report_lexer_error(err_msg) + # insert an INDENT token before the first statement to trigger an 'unexpected indent' error later in the parser + self._create_and_add_pending_token(self.INDENT, ERR_TXT + err_msg, self._cur_token) + + def _handle_NEWLINE_token(self) -> None: + if self._lexer_mode_stack: # for multi line f/t-string literals + self._add_pending_token(self._cur_token) + return + + if self._opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token + self._hide_and_add_pending_token(self._cur_token) + return + + nl_token: CommonToken = self._cur_token.clone() # save the current NEWLINE token + is_looking_ahead: bool = self._ffg_token.type == self.WS + if is_looking_ahead: + self._set_current_and_following_tokens() # set the next two tokens + + match self._ffg_token.type: + case self.NEWLINE | self.COMMENT: + # We're before a blank line or a comment or type comment or a type ignore comment + self._hide_and_add_pending_token(nl_token) # ignore the NEWLINE token + if is_looking_ahead: + self._add_pending_token(self._cur_token) # WS token + case _: + self._add_pending_token(nl_token) + if is_looking_ahead: # We're on a whitespace(s) followed by a statement + indentation_length: int = 0 if self._ffg_token.type == Token.EOF else \ + self._get_indentation_length(self._cur_token.text) + + if indentation_length != INVALID_LENGTH: + self._add_pending_token(self._cur_token) # WS token + self._insert_INDENT_or_DEDENT_token(indentation_length) # may insert INDENT token or DEDENT token(s) + else: + self._report_error("inconsistent use of tabs and spaces in indentation") + else: # We're at a newline followed by a statement (there is no whitespace before the statement) + self._insert_INDENT_or_DEDENT_token(0) # may insert DEDENT token(s) + + def _insert_INDENT_or_DEDENT_token(self, indent_length: int) -> None: + prev_indent_length: int = self._indent_length_stack[-1] # stack peek + if indent_length > prev_indent_length: + 
self._create_and_add_pending_token(self.INDENT, None, self._ffg_token) + self._indent_length_stack.append(indent_length) # stack push + return + + while indent_length < prev_indent_length: # more than 1 DEDENT token may be inserted to the token stream + self._indent_length_stack.pop() + prev_indent_length = self._indent_length_stack[-1] # stack peek + if indent_length <= prev_indent_length: + self._create_and_add_pending_token(self.DEDENT, None, self._ffg_token) + else: + self._report_error("inconsistent dedent") + + def _check_cur_token(self) -> None: + match self._cur_token.type: + case self.FSTRING_START: + self._cur_ISTRING_MIDDLE_token_type = self.FSTRING_MIDDLE + self._set_lexer_mode_by_ISTRING_START_token() + return + case self.TSTRING_START: + self._cur_ISTRING_MIDDLE_token_type = self.TSTRING_MIDDLE + self._set_lexer_mode_by_ISTRING_START_token() + return + case self.FSTRING_MIDDLE | self.TSTRING_MIDDLE: + self._handle_ISTRING_MIDDLE_token_with_quote_and_lbrace() # affect the opened field + match self._cur_token.type: + case self.FSTRING_MIDDLE | self.TSTRING_MIDDLE: + return # No _cur_token exchange happened + case self.FSTRING_END | self.TSTRING_END: + self._pop_lexer_mode() + return + case _: + if not self._lexer_mode_stack: + return # Not in fstring mode + self._process_brace_expression() + + def _process_brace_expression(self) -> None: + match self._cur_token.type: # the following tokens can only come from default mode (after an LBRACE in f/t-string) + case self.NEWLINE: + # append the current brace expression with the current newline + self._append_to_brace_expression(self._cur_token.text) + self._cur_token.channel = Token.HIDDEN_CHANNEL + case self.LBRACE: + # the outermost brace expression cannot be a dictionary comprehension or a set comprehension + self._brace_expression_stack.append("{") + self._paren_or_bracket_opened_stack.append(0) # stack push + self._push_lexer_mode(Lexer.DEFAULT_MODE) + case self.LPAR | self.LSQB: + # append the current 
brace expression with a "(" or a "[" + self._append_to_brace_expression(self._cur_token.text) + # https://peps.python.org/pep-0498/#lambdas-inside-expressions + self._increment_brace_stack() + case self.RPAR | self.RSQB: + # append the current brace expression with a ")" or a "]" + self._append_to_brace_expression(self._cur_token.text) + self._decrement_brace_stack() + case self.COLON | self.COLONEQUAL: + # append the current brace expression with a ":" or a ":=" + self._append_to_brace_expression(self._cur_token.text) + self._set_lexer_mode_by_COLON_or_COLONEQUAL_token() + case self.RBRACE: + self._set_lexer_mode_after_RBRACE_token() + case _: + # append the current brace expression with the current token text + self._append_to_brace_expression(self._cur_token.text) + + def _append_to_brace_expression(self, text: str) -> None: + self._brace_expression_stack[-1] += text + + def _increment_brace_stack(self) -> None: # increment the last element (stack peek + 1) + self._paren_or_bracket_opened_stack[-1] += 1 + + def _decrement_brace_stack(self) -> None: # decrement the last element (stack peek - 1) + self._paren_or_bracket_opened_stack[-1] -= 1 + + def _set_lexer_mode_after_RBRACE_token(self) -> None: + match self._cur_lexer_mode: + case Lexer.DEFAULT_MODE: + self._pop_lexer_mode() # only once + self._pop_by_RBRACE() + case ( self.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE + | self.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE + | self.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE + | self.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE + | self.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE + | self.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE + | self.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE + | self.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE + | self.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE + | self.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE + | self.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE + | self.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE + | self.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE + | 
self.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE + | self.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE + | self.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE): + + self._pop_lexer_mode() + self._pop_lexer_mode() + self._pop_by_RBRACE() + case _: + self._report_lexer_error("f-string: single '}' is not allowed") + + def _set_lexer_mode_by_ISTRING_START_token(self) -> None: + # ISTRING = interpolated string (FSTRING or TSTRING) + if not PythonLexerBase._LEXER_MODES_FOR_ISTRING_START: + PythonLexerBase._init_lexer_modes_for_istring_start() + + interpolated_string_prefix: str = self._cur_token.text.lower() + if interpolated_string_prefix in PythonLexerBase._LEXER_MODES_FOR_ISTRING_START: + new_lexer_mode: int = PythonLexerBase._LEXER_MODES_FOR_ISTRING_START[interpolated_string_prefix] + self._push_lexer_mode(new_lexer_mode) + else: + self._report_lexer_error( + f"internal error: unknown interpolated string literal prefix: {self._cur_token.text}" + ) + + @staticmethod + def _init_lexer_modes_for_istring_start() -> None: + # f-strings + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["f'"] = PythonLexer.PythonLexer.SQ1__FSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["rf'"] = PythonLexer.PythonLexer.SQ1R_FSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["fr'"] = PythonLexer.PythonLexer.SQ1R_FSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['f"'] = PythonLexer.PythonLexer.DQ1__FSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['rf"'] = PythonLexer.PythonLexer.DQ1R_FSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['fr"'] = PythonLexer.PythonLexer.DQ1R_FSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["f'''"] = PythonLexer.PythonLexer.SQ3__FSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["rf'''"] = PythonLexer.PythonLexer.SQ3R_FSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["fr'''"] = PythonLexer.PythonLexer.SQ3R_FSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['f"""'] = 
PythonLexer.PythonLexer.DQ3__FSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['rf"""'] = PythonLexer.PythonLexer.DQ3R_FSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['fr"""'] = PythonLexer.PythonLexer.DQ3R_FSTRING_MODE + + # t-strings + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["t'"] = PythonLexer.PythonLexer.SQ1__TSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["rt'"] = PythonLexer.PythonLexer.SQ1R_TSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["tr'"] = PythonLexer.PythonLexer.SQ1R_TSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['t"'] = PythonLexer.PythonLexer.DQ1__TSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['rt"'] = PythonLexer.PythonLexer.DQ1R_TSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['tr"'] = PythonLexer.PythonLexer.DQ1R_TSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["t'''"] = PythonLexer.PythonLexer.SQ3__TSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["rt'''"] = PythonLexer.PythonLexer.SQ3R_TSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["tr'''"] = PythonLexer.PythonLexer.SQ3R_TSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['t"""'] = PythonLexer.PythonLexer.DQ3__TSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['rt"""'] = PythonLexer.PythonLexer.DQ3R_TSTRING_MODE + PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['tr"""'] = PythonLexer.PythonLexer.DQ3R_TSTRING_MODE + + def _set_lexer_mode_by_COLON_or_COLONEQUAL_token(self) -> None: + # Exit early when the current lexer mode indicates an open parenthesis/bracket + opened: bool = self._paren_or_bracket_opened_stack[-1] > 0 # stack peek + if opened: + return + + # COLONEQUAL token will be replaced with a COLON token in _check_next_token() + prevLexerMode = self._lexer_mode_stack[-1] # stack peek + match prevLexerMode: + case self.SQ1__FSTRING_MODE | self.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + 
self._push_lexer_mode(self.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE) + + case self.SQ1__TSTRING_MODE | self.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE) + + case self.SQ1R_FSTRING_MODE | self.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE) + + case self.SQ1R_TSTRING_MODE | self.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE) + + case self.DQ1__FSTRING_MODE | self.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE) + + case self.DQ1__TSTRING_MODE | self.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE) + + case self.DQ1R_FSTRING_MODE | self.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE) + + case self.DQ1R_TSTRING_MODE | self.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE) + + case self.SQ3__FSTRING_MODE | self.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE) + case self.SQ3__TSTRING_MODE | self.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE) + + case self.SQ3R_FSTRING_MODE | self.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE) + case self.SQ3R_TSTRING_MODE | self.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE) + + case self.DQ3__FSTRING_MODE | self.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE) + + case self.DQ3__TSTRING_MODE | self.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE) + case 
self.DQ3R_FSTRING_MODE | self.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE) + + case self.DQ3R_TSTRING_MODE | self.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + self._push_lexer_mode(self.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE) + + def _pop_by_RBRACE(self) -> None: + self._paren_or_bracket_opened_stack.pop() + cur_brace_expression: str = self._brace_expression_stack.pop() + self._prev_brace_expression = cur_brace_expression + "}" + if self._brace_expression_stack: + # Extend the current brace expression by adding the previous expression + self._brace_expression_stack[-1] += self._prev_brace_expression + + def _handle_ISTRING_MIDDLE_token_with_double_brace(self) -> None: + # ISTRING = interpolated string (FSTRING or TSTRING) + last_two_chars: str = self._get_last_two_chars_of_the_cur_token_text() + match last_two_chars: + case "{{": + self._trim_last_char_add_pending_token_set_cur_token(self.LBRACE, "{", Token.HIDDEN_CHANNEL) + case "}}": + self._trim_last_char_add_pending_token_set_cur_token(self.RBRACE, "}", Token.HIDDEN_CHANNEL) + + def _handle_ISTRING_MIDDLE_token_with_quote_and_lbrace(self) -> None: # ISTRING = interpolated string (FSTRING or TSTRING) + # replace the trailing quote + left_brace with a quote and insert an LBRACE token + # replace the trailing backslash + left_brace with a backslash and insert an LBRACE token + last_two_chars: str = self._get_last_two_chars_of_the_cur_token_text() + match last_two_chars: + case "\"{" | "'{" | "\\{": + self._trim_last_char_add_pending_token_set_cur_token(self.LBRACE, "{", Token.DEFAULT_CHANNEL) + + def _get_last_two_chars_of_the_cur_token_text(self) -> str: + text: str = self._cur_token.text + return text[-2:] if len(text) >= 2 else text + + def _trim_last_char_add_pending_token_set_cur_token(self, type: int, text: str, channel: int) -> None: + # trim the last char and add the modified curToken to the _pending_tokens stack + token_text_without_lbrace: str 
= self._cur_token.text[:-1] + self._cur_token.text = token_text_without_lbrace + self._cur_token.stop -= 1 + self._add_pending_token(self._cur_token) + + self._create_new_cur_token(type, text, channel) # set _cur_token + + def _handle_COLONEQUAL_token_in_istring(self) -> None: # istring = interpolated string (FSTRING or TSTRING) + if self._lexer_mode_stack \ + and self._paren_or_bracket_opened_stack[-1] == 0: # stack peek == 0 + + # In an f/t-string, the walrus operator (:=) is only allowed inside parentheses. + # If used outside, split the COLONEQUAL token into a COLON + # (used as a format specifier instead of a walrus operator), + # and move the equal sign to the beginning of the next token (FSTRING_MIDDLE or TSTRING_MIDDLE). + self._cur_token.type = self.COLON + self._cur_token.text = ":" + self._cur_token.stop = self._cur_token.start + + match self._ffg_token.type: + case self.FSTRING_MIDDLE | self.TSTRING_MIDDLE: + token: CommonToken = self._ffg_token.clone() + token.text = "=" + token.text + token.start -= 1 + token.column -= 1 + self._ffg_token = token + case _: + self._add_pending_token(self._cur_token) + self._create_new_cur_token(self._cur_ISTRING_MIDDLE_token_type, "=", Token.DEFAULT_CHANNEL) + self._add_pending_token(self._cur_token) + + def _create_new_cur_token(self, type: int, text: str, channel: int) -> None: + token: CommonToken = self._cur_token.clone() + token.type = type + token.text = text + token.channel = channel + token.column += 1 + token.start += 1 + token.stop = token.start + self._cur_token = token + + def _push_lexer_mode(self, mode: int) -> None: + self.pushMode(mode) + self._lexer_mode_stack.append(self._cur_lexer_mode) # stack push + self._cur_lexer_mode = mode + + def _pop_lexer_mode(self) -> None: + self.popMode() + self._cur_lexer_mode = self._lexer_mode_stack.pop() + + def _handle_FORMAT_SPECIFICATION_MODE(self) -> None: + if not self._lexer_mode_stack or self._ffg_token.type != self.RBRACE: + return + + # insert an empty 
FSTRING_MIDDLE or TSTRING_MIDDLE token instead of the missing format specification + match self._cur_token.type: + case self.COLON: + self._create_and_add_pending_token(self._cur_ISTRING_MIDDLE_token_type, "", self._ffg_token) + case self.RBRACE: + # only when the previous brace expression is not a dictionary comprehension or set comprehension + if not self._is_valid_dictionary_or_set_comprehension_expression(self._prev_brace_expression): + self._create_and_add_pending_token(self._cur_ISTRING_MIDDLE_token_type, "", self._ffg_token) + + def _is_valid_dictionary_or_set_comprehension_expression(self, code: str) -> bool: + from antlr4 import InputStream, CommonTokenStream + from PythonLexer import PythonLexer + from PythonParser import PythonParser + + input_stream: InputStream = InputStream(code) + lexer: PythonLexer = PythonLexer(input_stream) + token_stream: CommonTokenStream = CommonTokenStream(lexer) + parser: PythonParser = PythonParser(token_stream) + + # Disable error listeners to suppress console output + lexer.removeErrorListeners() + parser.removeErrorListeners() + + parser.dictcomp() # Try parsing as dictionary comprehension + if parser.getNumberOfSyntaxErrors() == 0: + return True + + parser = PythonParser(token_stream) + token_stream.seek(0) + parser.removeErrorListeners() + parser.setcomp() # Try parsing as set comprehension + return parser.getNumberOfSyntaxErrors() == 0 + + def _insert_trailing_tokens(self) -> None: + match self._last_pending_token_type_from_default_channel: + case self.NEWLINE | self.DEDENT: + pass # no trailing NEWLINE token is needed + case _: # insert an extra trailing NEWLINE token that serves as the end of the last statement + self._create_and_add_pending_token(self.NEWLINE, None, self._ffg_token) # _ffg_token is EOF + self._insert_INDENT_or_DEDENT_token(0) # Now insert as much trailing DEDENT tokens as needed + + def _handle_EOF_token(self) -> None: + if self._last_pending_token_type_from_default_channel > 0: + # there was 
statement in the input (leading NEWLINE tokens are hidden) + self._insert_trailing_tokens() + self._add_pending_token(self._cur_token) + + def _hide_and_add_pending_token(self, original_token: CommonToken) -> None: + original_token.channel = Token.HIDDEN_CHANNEL + self._add_pending_token(original_token) + + def _create_and_add_pending_token(self, ttype: int, text: Optional[str], original_token: CommonToken) -> None: + token: CommonToken = original_token.clone() + token.type = ttype + token.channel = Token.DEFAULT_CHANNEL + token.stop = original_token.start - 1 + token.text = "<" + self.symbolicNames[ttype] + ">" if text is None else \ + text + + self._add_pending_token(token) + + def _add_pending_token(self, token: CommonToken) -> None: + # save the last pending token type because the _pending_tokens list can be empty by the nextToken() + self._previous_pending_token_type = token.type + if token.channel == Token.DEFAULT_CHANNEL: + self._last_pending_token_type_from_default_channel = self._previous_pending_token_type + self._pending_tokens.append(token) + + def _get_indentation_length(self, indentText: str) -> int: # the indentText may contain spaces, tabs or form feeds + length: int = 0 + ch: str + for ch in indentText: + match ch: + case ' ': + self._was_space_indentation = True + length += 1 + case '\t': + self._was_tab_indentation = True + length += PythonLexerBase.TAB_LENGTH - (length % PythonLexerBase.TAB_LENGTH) + case '\f': # form feed + length = 0 + + if self._was_tab_indentation and self._was_space_indentation: + if not self._was_indentation_mixed_with_spaces_and_tabs: + self._was_indentation_mixed_with_spaces_and_tabs = True + length = INVALID_LENGTH # only for the first inconsistent indent + return length + + def _report_lexer_error(self, err_msg: str) -> None: + self.getErrorListenerDispatch().syntaxError(self, self._cur_token.type, self._cur_token.line, self._cur_token.column, " LEXER" + ERR_TXT + err_msg, None) + + def _report_error(self, err_msg: 
str) -> None: + self._report_lexer_error(err_msg) + + self._create_and_add_pending_token(self.ERRORTOKEN, ERR_TXT + err_msg, self._ffg_token) + # the ERRORTOKEN also triggers a parser error diff --git a/python/python3_13/Python3_13_2_official_grammar.peg b/python/python3_14/Python3_14_2_official_grammar.peg similarity index 94% rename from python/python3_13/Python3_13_2_official_grammar.peg rename to python/python3_14/Python3_14_2_official_grammar.peg index e774b1e92b..27990f68f2 100644 --- a/python/python3_13/Python3_13_2_official_grammar.peg +++ b/python/python3_14/Python3_14_2_official_grammar.peg @@ -56,7 +56,7 @@ # ~ # Commit to the current alternative, even if it fails to parse. # &&e -# Eager parse e. The parser will not backtrack and will immediately +# Eager parse e. The parser will not backtrack and will immediately # fail with SyntaxError if e cannot be parsed. # @@ -73,10 +73,15 @@ func_type: '(' [type_expressions] ')' '->' expression NEWLINE* ENDMARKER statements: statement+ -statement: compound_stmt | simple_stmts +statement: + | compound_stmt + | simple_stmts + +single_compound_stmt: + | compound_stmt statement_newline: - | compound_stmt NEWLINE + | single_compound_stmt NEWLINE | simple_stmts | NEWLINE | ENDMARKER @@ -94,12 +99,12 @@ simple_stmt: | return_stmt | import_stmt | raise_stmt - | 'pass' + | pass_stmt | del_stmt | yield_stmt | assert_stmt - | 'break' - | 'continue' + | break_stmt + | continue_stmt | global_stmt | nonlocal_stmt @@ -121,8 +126,8 @@ assignment: | NAME ':' expression ['=' annotated_rhs ] | ('(' single_target ')' | single_subscript_attribute_target) ':' expression ['=' annotated_rhs ] - | (star_targets '=' )+ (yield_expr | star_expressions) !'=' [TYPE_COMMENT] - | single_target augassign ~ (yield_expr | star_expressions) + | (star_targets '=' )+ annotated_rhs !'=' [TYPE_COMMENT] + | single_target augassign ~ annotated_rhs annotated_rhs: yield_expr | star_expressions @@ -148,6 +153,15 @@ raise_stmt: | 'raise' expression ['from' 
expression ] | 'raise' +pass_stmt: + | 'pass' + +break_stmt: + | 'break' + +continue_stmt: + | 'continue' + global_stmt: 'global' ','.NAME+ nonlocal_stmt: 'nonlocal' ','.NAME+ @@ -179,10 +193,12 @@ import_from_as_names: | ','.import_from_as_name+ import_from_as_name: | NAME ['as' NAME ] + dotted_as_names: | ','.dotted_as_name+ dotted_as_name: | dotted_name ['as' NAME ] + dotted_name: | dotted_name '.' NAME | NAME @@ -334,10 +350,14 @@ try_stmt: # ---------------- except_block: - | 'except' expression ['as' NAME ] ':' block + | 'except' expression ':' block + | 'except' expression 'as' NAME ':' block + | 'except' expressions ':' block | 'except' ':' block except_star_block: - | 'except' '*' expression ['as' NAME ] ':' block + | 'except' '*' expression ':' block + | 'except' '*' expression 'as' NAME ':' block + | 'except' '*' expressions ':' block finally_block: | 'finally' ':' block @@ -495,8 +515,7 @@ type_alias: # Type parameter declaration # -------------------------- -type_params: - | invalid_type_params +type_params: | '[' type_param_seq ']' type_param_seq: ','.type_param+ [','] @@ -742,8 +761,25 @@ fstring_format_spec: fstring: | FSTRING_START fstring_middle* FSTRING_END +tstring_format_spec_replacement_field: + | '{' annotated_rhs '='? [fstring_conversion] [tstring_full_format_spec] '}' +tstring_format_spec: + | TSTRING_MIDDLE + | tstring_format_spec_replacement_field +tstring_full_format_spec: + | ':' tstring_format_spec* +tstring_replacement_field: + | '{' annotated_rhs '='? 
[fstring_conversion] [tstring_full_format_spec] '}' +tstring_middle: + | tstring_replacement_field + | TSTRING_MIDDLE +tstring: + | TSTRING_START tstring_middle* TSTRING_END + string: STRING -strings: (fstring|string)+ +strings: + | (fstring|string)+ + | tstring+ list: | '[' [star_named_expressions] ']' diff --git a/python/python3_13/PythonLexer.g4 b/python/python3_14/PythonLexer.g4 similarity index 61% rename from python/python3_13/PythonLexer.g4 rename to python/python3_14/PythonLexer.g4 index 98b99d4aef..a6ed067c3c 100644 --- a/python/python3_13/PythonLexer.g4 +++ b/python/python3_14/PythonLexer.g4 @@ -21,35 +21,41 @@ THE SOFTWARE. */ /* - * Project : an ANTLR4 lexer grammar for Python 3 - * https://github.com/RobEin/ANTLR4-parser-for-Python-3.13 + * Project : an ANTLR4 lexer grammar for Python 3 programming language + * https://github.com/RobEin/ANTLR4-parser-for-Python-3.14 * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com */ -// https://docs.python.org/3.13/reference/lexical_analysis.html - +// https://docs.python.org/3.14/reference/lexical_analysis.html lexer grammar PythonLexer; +// the helper class for this grammar that assists in tokenizing indentation, interpolated strings, and the encoding declaration options { superClass=PythonLexerBase; } tokens { - ENCODING // https://docs.python.org/3.13/reference/lexical_analysis.html#encoding-declarations - , INDENT, DEDENT // https://docs.python.org/3.13/reference/lexical_analysis.html#indentation - , TYPE_COMMENT // not supported, only for compatibility with the PythonParser.g4 grammar + ENCODING // https://docs.python.org/3.14/reference/lexical_analysis.html#encoding-declarations + , INDENT, DEDENT // https://docs.python.org/3.14/reference/lexical_analysis.html#indentation + , TYPE_COMMENT // not supported, only for compatibility with the parser grammar , FSTRING_START, FSTRING_MIDDLE, FSTRING_END // https://peps.python.org/pep-0701/#specification + , TSTRING_START, TSTRING_MIDDLE, TSTRING_END // 
https://peps.python.org/pep-0750/#specification } /* * default lexer mode */ -// https://docs.python.org/3.13/library/token.html#module-token -LPAR : '('; // OPEN_PAREN -LSQB : '['; // OPEN_BRACK -LBRACE : '{'; // OPEN_BRACE -RPAR : ')'; // CLOSE_PAREN -RSQB : ']'; // CLOSE_BRACK -RBRACE : '}'; // CLOSE_BRACE +// https://docs.python.org/3.14/reference/lexical_analysis.html#encoding-declarations +BOM : '\uFEFF'; +// The BOM unicode character indicates that a BOM byte sequence (for Python is only UTF‑8: EF BB BF) was present at the start of the file. +// It is not part of Python source code and is therefore skipped in PythonLexerBase. + +// https://docs.python.org/3.14/library/token.html#module-token +LPAR : '('; +LSQB : '['; +LBRACE : '{'; +RPAR : ')'; +RSQB : ']'; +RBRACE : '}'; DOT : '.'; COLON : ':'; COMMA : ','; @@ -93,7 +99,7 @@ ELLIPSIS : '...'; COLONEQUAL : ':='; EXCLAMATION : '!'; -// https://docs.python.org/3.13/reference/lexical_analysis.html#keywords +// https://docs.python.org/3.14/reference/lexical_analysis.html#keywords FALSE : 'False'; AWAIT : 'await'; ELSE : 'else'; @@ -130,157 +136,296 @@ IF : 'if'; OR : 'or'; YIELD : 'yield'; -// *** Soft Keywords: https://docs.python.org/3.13/reference/lexical_analysis.html#soft-keywords -NAME_OR_TYPE : 'type'; // identifier or type keyword, the parser grammar will decide what it means -NAME_OR_MATCH : 'match'; // identifier or match keyword, the parser grammar will decide what it means -NAME_OR_CASE : 'case'; // identifier or case keyword, the parser grammar will decide what it means -NAME_OR_WILDCARD : '_'; // identifier or wildcard symbol, the parser grammar will decide what it means +// *** Soft Keywords: https://docs.python.org/3.14/reference/lexical_analysis.html#soft-keywords + // the parser grammar determines whether it is an ... +NAME_OR_TYPE : 'type'; // ... identifier or a type keyword, depending on the source code context +NAME_OR_MATCH : 'match'; // ... 
identifier or a match keyword, depending on the source code context +NAME_OR_CASE : 'case'; // ... identifier or a case keyword, depending on the source code context +NAME_OR_WILDCARD : '_'; // ... identifier or a wildcard symbol, depending on the source code context -// https://docs.python.org/3.13/reference/lexical_analysis.html#identifiers +// https://docs.python.org/3.14/reference/lexical_analysis.html#identifiers NAME : ID_START ID_CONTINUE*; -// https://docs.python.org/3.13/reference/lexical_analysis.html#numeric-literals +// https://docs.python.org/3.14/reference/lexical_analysis.html#numeric-literals NUMBER : INTEGER | FLOAT_NUMBER | IMAG_NUMBER ; -// https://docs.python.org/3.13/reference/lexical_analysis.html#string-and-bytes-literals +// https://docs.python.org/3.14/reference/lexical_analysis.html#string-and-bytes-literals STRING : STRING_LITERAL | BYTES_LITERAL ; -// https://docs.python.org/3.13/reference/lexical_analysis.html#physical-lines +// https://docs.python.org/3.14/reference/lexical_analysis.html#physical-lines NEWLINE : '\r'? 
'\n'; // Unix, Windows -// https://docs.python.org/3.13/reference/lexical_analysis.html#comments +// https://docs.python.org/3.14/reference/lexical_analysis.html#comments COMMENT : '#' ~[\r\n]* -> channel(HIDDEN); -// https://docs.python.org/3.13/reference/lexical_analysis.html#whitespace-between-tokens +// https://docs.python.org/3.14/reference/lexical_analysis.html#whitespace-between-tokens WS : [ \t\f]+ -> channel(HIDDEN); -// https://docs.python.org/3.13/reference/lexical_analysis.html#explicit-line-joining +// https://docs.python.org/3.14/reference/lexical_analysis.html#explicit-line-joining EXPLICIT_LINE_JOINING : BACKSLASH_NEWLINE -> channel(HIDDEN); +// https://docs.python.org/3.14/reference/lexical_analysis.html#formatted-string-literals +FSTRING_START : FSTRING_PREFIX STRING_QUOTES; // pushMode(...._FSTRING_MODE) is called in PythonLexerBase +TSTRING_START : TSTRING_PREFIX STRING_QUOTES; // pushMode(...._TSTRING_MODE) is called in PythonLexerBase -// ************************* -// abbreviations for FSTRING -// ************************* -// SQ1__FSTRING = short single quoted formatted string: f'abc' -// DQ1__FSTRING = short double quoted formatted string: f"abc" -// SQ1R_FSTRING = short single quoted raw formatted string: rf'abc' -// DQ1R_FSTRING = short double quoted raw formatted string: rf"abc" -// -// SQ3__FSTRING = long single quoted formatted string: f'''abc''' -// DQ3__FSTRING = long double quoted formatted string: f"""abc""" -// SQ3R_FSTRING = long single quoted raw formatted string: rf'''abc''' -// DQ3R_FSTRING = long double quoted raw formatted string: rf"""abc""" - -// https://docs.python.org/3.13/reference/lexical_analysis.html#formatted-string-literals -FSTRING_START : FSTRING_PREFIX (['] - | ["] - | ['][']['] - | ["]["]["]) - ; // pushMode(????_FSTRING_MODE) will be called in PythonLexerBase class - -// catch the unrecognized characters -ERRORTOKEN : . 
; // PythonLexerBase class will report an error about this (the ERRORTOKEN will also cause an error in the parser) - +// catch unrecognized characters +ERRORTOKEN : . ; // the PythonLexerBase class reports a lexer error for them (ERRORTOKEN also triggers a parser error) /* - * other lexer modes + * lexer modes for interpolation string literals */ +// ********************************************************************** +// Abbreviations for interpolation string literals (f-strings, t-strings) +// ********************************************************************** +// SQ1__ISTRING = short single quoted interpolation string, e.g.: f'Hello {name}' +// DQ1__ISTRING = short double quoted interpolation string, e.g.: f"Hello {name}" +// SQ1R_ISTRING = short single quoted raw interpolation string, e.g.: rf'Hello {name}' +// DQ1R_ISTRING = short double quoted raw interpolation string, e.g.: rf"Hello {name}" +// +// SQ3__ISTRING = long single quoted interpolation string, e.g.: f'''Hello {name}''' +// DQ3__ISTRING = long double quoted interpolation string, e.g.: f"""Hello {name}""" +// SQ3R_ISTRING = long single quoted raw interpolation string, e.g.: rf'''Hello {name}''' +// DQ3R_ISTRING = long double quoted raw interpolation string, e.g.: rf"""Hello {name}""" + mode SQ1__FSTRING_MODE; - SQ1__FSTRING_END : ['] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class - SQ1__FSTRING_MIDDLE : SQ1__FSTRING_ITEM -> type(FSTRING_MIDDLE); - SQ1__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1__FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class + SQ1__FSTRING_END : ['] -> type(FSTRING_END); // popMode() is called in PythonLexerBase + SQ1__FSTRING_MIDDLE : SQ1__ISTRING_ITEM -> type(FSTRING_MIDDLE); + SQ1__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1__FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + SQ1__FSTRING_ERRORTOKEN : . 
-> type(ERRORTOKEN); + +mode SQ1__TSTRING_MODE; + SQ1__TSTRING_END : ['] -> type(TSTRING_END); // popMode() is called in PythonLexerBase + SQ1__TSTRING_MIDDLE : SQ1__ISTRING_ITEM -> type(TSTRING_MIDDLE); + SQ1__TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1__TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + SQ1__TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN); + + mode SQ1R_FSTRING_MODE; - SQ1R_FSTRING_END : ['] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class - SQ1R_FSTRING_MIDDLE : SQ1R_FSTRING_ITEM -> type(FSTRING_MIDDLE); - SQ1R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1R_FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class + SQ1R_FSTRING_END : ['] -> type(FSTRING_END); // popMode() is called in PythonLexerBase + SQ1R_FSTRING_MIDDLE : SQ1R_ISTRING_ITEM -> type(FSTRING_MIDDLE); + SQ1R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + SQ1R_FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN); + +mode SQ1R_TSTRING_MODE; + SQ1R_TSTRING_END : ['] -> type(TSTRING_END); // popMode() is called in PythonLexerBase + SQ1R_TSTRING_MIDDLE : SQ1R_ISTRING_ITEM -> type(TSTRING_MIDDLE); + SQ1R_TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + SQ1R_TSTRING_ERRORTOKEN : . 
-> type(ERRORTOKEN); + + mode DQ1__FSTRING_MODE; - DQ1__FSTRING_END : ["] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class - DQ1__FSTRING_MIDDLE : DQ1__FSTRING_ITEM -> type(FSTRING_MIDDLE); - DQ1__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1__FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class + DQ1__FSTRING_END : ["] -> type(FSTRING_END); // popMode() is called in PythonLexerBase + DQ1__FSTRING_MIDDLE : DQ1__ISTRING_ITEM -> type(FSTRING_MIDDLE); + DQ1__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1__FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + DQ1__FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN); + +mode DQ1__TSTRING_MODE; + DQ1__TSTRING_END : ["] -> type(TSTRING_END); // popMode() is called in PythonLexerBase + DQ1__TSTRING_MIDDLE : DQ1__ISTRING_ITEM -> type(TSTRING_MIDDLE); + DQ1__TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1__TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + DQ1__TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN); + + mode DQ1R_FSTRING_MODE; - DQ1R_FSTRING_END : ["] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class - DQ1R_FSTRING_MIDDLE : DQ1R_FSTRING_ITEM -> type(FSTRING_MIDDLE); - DQ1R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1R_FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class + DQ1R_FSTRING_END : ["] -> type(FSTRING_END); // popMode() is called in PythonLexerBase + DQ1R_FSTRING_MIDDLE : DQ1R_ISTRING_ITEM -> type(FSTRING_MIDDLE); + DQ1R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + DQ1R_FSTRING_ERRORTOKEN : . 
-> type(ERRORTOKEN); + +mode DQ1R_TSTRING_MODE; + DQ1R_TSTRING_END : ["] -> type(TSTRING_END); // popMode() is called in PythonLexerBase + DQ1R_TSTRING_MIDDLE : DQ1R_ISTRING_ITEM -> type(TSTRING_MIDDLE); + DQ1R_TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + DQ1R_TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN); + + mode SQ3__FSTRING_MODE; - SQ3__FSTRING_END : ['][']['] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class - SQ3__FSTRING_MIDDLE : SQ3__FSTRING_ITEM -> type(FSTRING_MIDDLE); - SQ3__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3__FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class + SQ3__FSTRING_END : ['][']['] -> type(FSTRING_END); // popMode() is called in PythonLexerBase + SQ3__FSTRING_MIDDLE : SQ3__ISTRING_ITEM -> type(FSTRING_MIDDLE); + SQ3__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3__FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + SQ3__FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN); + +mode SQ3__TSTRING_MODE; + SQ3__TSTRING_END : ['][']['] -> type(TSTRING_END); // popMode() is called in PythonLexerBase + SQ3__TSTRING_MIDDLE : SQ3__ISTRING_ITEM -> type(TSTRING_MIDDLE); + SQ3__TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3__TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + SQ3__TSTRING_ERRORTOKEN : . 
-> type(ERRORTOKEN); + + mode SQ3R_FSTRING_MODE; - SQ3R_FSTRING_END : ['][']['] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class - SQ3R_FSTRING_MIDDLE : SQ3R_FSTRING_ITEM -> type(FSTRING_MIDDLE); - SQ3R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3R_FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class + SQ3R_FSTRING_END : ['][']['] -> type(FSTRING_END); // popMode() is called in PythonLexerBase + SQ3R_FSTRING_MIDDLE : SQ3R_ISTRING_ITEM -> type(FSTRING_MIDDLE); + SQ3R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + SQ3R_FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN); + +mode SQ3R_TSTRING_MODE; + SQ3R_TSTRING_END : ['][']['] -> type(TSTRING_END); // popMode() is called in PythonLexerBase + SQ3R_TSTRING_MIDDLE : SQ3R_ISTRING_ITEM -> type(TSTRING_MIDDLE); + SQ3R_TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + SQ3R_TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN); + + mode DQ3__FSTRING_MODE; - DQ3__FSTRING_END : ["]["]["] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class - DQ3__FSTRING_MIDDLE : DQ3__FSTRING_ITEM -> type(FSTRING_MIDDLE); - DQ3__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3__FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class + DQ3__FSTRING_END : ["]["]["] -> type(FSTRING_END); // popMode() is called in PythonLexerBase + DQ3__FSTRING_MIDDLE : DQ3__ISTRING_ITEM -> type(FSTRING_MIDDLE); + DQ3__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3__FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + DQ3__FSTRING_ERRORTOKEN : . 
-> type(ERRORTOKEN); + +mode DQ3__TSTRING_MODE; + DQ3__TSTRING_END : ["]["]["] -> type(TSTRING_END); // popMode() is called in PythonLexerBase + DQ3__TSTRING_MIDDLE : DQ3__ISTRING_ITEM -> type(TSTRING_MIDDLE); + DQ3__TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3__TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + DQ3__TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN); + + mode DQ3R_FSTRING_MODE; - DQ3R_FSTRING_END : ["]["]["] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class - DQ3R_FSTRING_MIDDLE : DQ3R_FSTRING_ITEM -> type(FSTRING_MIDDLE); - DQ3R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3R_FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class - - -mode SQ1__FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon - SQ1__FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ1__FSTRING_PART+ -> type(FSTRING_MIDDLE); - SQ1__FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class - SQ1__FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to SQ1__FSTRING_MODE by PythonLexerBase class - -mode SQ1R_FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon - SQ1R_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ1R_FSTRING_PART+ -> type(FSTRING_MIDDLE); - SQ1R_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class - SQ1R_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to SQ1R_FSTRING_MODEby PythonLexerBase class - -mode DQ1__FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon - DQ1__FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ1__FSTRING_PART+ -> type(FSTRING_MIDDLE); - DQ1__FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class - DQ1__FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to DQ1__FSTRING_MODE by PythonLexerBase class - -mode DQ1R_FORMAT_SPECIFICATION_MODE; // it is 
only used after a format specifier colon - DQ1R_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ1R_FSTRING_PART+ -> type(FSTRING_MIDDLE); - DQ1R_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class - DQ1R_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to DQ1R_FSTRING_MODE by PythonLexerBase class - -mode SQ3__FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon - SQ3__FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ3__FSTRING_PART+ -> type(FSTRING_MIDDLE); - SQ3__FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class - SQ3__FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to SQ3__FSTRING_MODE by PythonLexerBase class - -mode SQ3R_FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon - SQ3R_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ3R_FSTRING_PART+ -> type(FSTRING_MIDDLE); - SQ3R_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class - SQ3R_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to SQ3R_FSTRING_MODE by PythonLexerBase class - -mode DQ3__FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon - DQ3__FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ3__FSTRING_PART+ -> type(FSTRING_MIDDLE); - DQ3__FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class - DQ3__FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to DQ3__FSTRING_MODE by PythonLexerBase class - -mode DQ3R_FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon - DQ3R_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ3R_FSTRING_PART+ -> type(FSTRING_MIDDLE); - DQ3R_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class - DQ3R_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to DQ3R_FSTRING_MODE by PythonLexerBase 
class + DQ3R_FSTRING_END : ["]["]["] -> type(FSTRING_END); // popMode() is called in PythonLexerBase + DQ3R_FSTRING_MIDDLE : DQ3R_ISTRING_ITEM -> type(FSTRING_MIDDLE); + DQ3R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + DQ3R_FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN); + +mode DQ3R_TSTRING_MODE; + DQ3R_TSTRING_END : ["]["]["] -> type(TSTRING_END); // popMode() is called in PythonLexerBase + DQ3R_TSTRING_MIDDLE : DQ3R_ISTRING_ITEM -> type(TSTRING_MIDDLE); + DQ3R_TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase + DQ3R_TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN); + + + +// *** format specification modes for interpolated strings *** +mode SQ1__FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + SQ1__FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ1__ISTRING_PART+ -> type(FSTRING_MIDDLE); + SQ1__FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + SQ1__FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ1__FSTRING_MODE is called in PythonLexerBase + SQ1__FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN); + +mode SQ1__TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + SQ1__TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : SQ1__ISTRING_PART+ -> type(TSTRING_MIDDLE); + SQ1__TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + SQ1__TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ1__TSTRING_MODE is called in PythonLexerBase + SQ1__TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . 
-> type(ERRORTOKEN); + + + +mode SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + SQ1R_FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ1R_ISTRING_PART+ -> type(FSTRING_MIDDLE); + SQ1R_FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + SQ1R_FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ1R_FSTRING_MODE is called in PythonLexerBase + SQ1R_FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN); + +mode SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + SQ1R_TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : SQ1R_ISTRING_PART+ -> type(TSTRING_MIDDLE); + SQ1R_TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + SQ1R_TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ1R_TSTRING_MODE is called in PythonLexerBase + SQ1R_TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN); + + + +mode DQ1__FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + DQ1__FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ1__ISTRING_PART+ -> type(FSTRING_MIDDLE); + DQ1__FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + DQ1__FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ1__FSTRING_MODE is called in PythonLexerBase + DQ1__FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . 
-> type(ERRORTOKEN); + +mode DQ1__TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + DQ1__TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : DQ1__ISTRING_PART+ -> type(TSTRING_MIDDLE); + DQ1__TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + DQ1__TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ1__TSTRING_MODE is called in PythonLexerBase + DQ1__TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN); + + + +mode DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + DQ1R_FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ1R_ISTRING_PART+ -> type(FSTRING_MIDDLE); + DQ1R_FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + DQ1R_FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ1R_FSTRING_MODE is called in PythonLexerBase + DQ1R_FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN); + +mode DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + DQ1R_TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : DQ1R_ISTRING_PART+ -> type(TSTRING_MIDDLE); + DQ1R_TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + DQ1R_TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ1R_TSTRING_MODE is called in PythonLexerBase + DQ1R_TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . 
-> type(ERRORTOKEN); + + + +mode SQ3__FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + SQ3__FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ3__ISTRING_PART+ -> type(FSTRING_MIDDLE); + SQ3__FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + SQ3__FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ3__FSTRING_MODE is called in PythonLexerBase + SQ3__FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN); + +mode SQ3__TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + SQ3__TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : SQ3__ISTRING_PART+ -> type(TSTRING_MIDDLE); + SQ3__TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + SQ3__TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ3__TSTRING_MODE is called in PythonLexerBase + SQ3__TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN); + + + +mode SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + SQ3R_FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ3R_ISTRING_PART+ -> type(FSTRING_MIDDLE); + SQ3R_FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + SQ3R_FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ3R_FSTRING_MODE is called in PythonLexerBase + SQ3R_FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . 
-> type(ERRORTOKEN); + +mode SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + SQ3R_TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : SQ3R_ISTRING_PART+ -> type(TSTRING_MIDDLE); + SQ3R_TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + SQ3R_TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ3R_TSTRING_MODE is called in PythonLexerBase + SQ3R_TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN); + + + +mode DQ3__FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + DQ3__FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ3__ISTRING_PART+ -> type(FSTRING_MIDDLE); + DQ3__FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + DQ3__FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ3__FSTRING_MODE is called in PythonLexerBase + DQ3__FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN); + +mode DQ3__TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + DQ3__TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : DQ3__ISTRING_PART+ -> type(TSTRING_MIDDLE); + DQ3__TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + DQ3__TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ3__TSTRING_MODE is called in PythonLexerBase + DQ3__TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . 
-> type(ERRORTOKEN); + + + +mode DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + DQ3R_FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ3R_ISTRING_PART+ -> type(FSTRING_MIDDLE); + DQ3R_FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + DQ3R_FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ3R_FSTRING_MODE is called in PythonLexerBase + DQ3R_FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN); + +mode DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon + DQ3R_TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : DQ3R_ISTRING_PART+ -> type(TSTRING_MIDDLE); + DQ3R_TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class + DQ3R_TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ3R_TSTRING_MODE is called in PythonLexerBase + DQ3R_TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN); + /* * fragments */ -// https://docs.python.org/3.13/reference/lexical_analysis.html#literals +// https://docs.python.org/3.14/reference/lexical_analysis.html#literals // -// https://docs.python.org/3.13/reference/lexical_analysis.html#string-and-bytes-literals +// https://docs.python.org/3.14/reference/lexical_analysis.html#string-and-bytes-literals fragment STRING_LITERAL : STRING_PREFIX? 
(SHORT_STRING | LONG_STRING); fragment STRING_PREFIX options { caseInsensitive=true; } : 'r' | 'u'; // 'r' | 'u' | 'R' | 'U' @@ -306,7 +451,7 @@ fragment LONG__STRING_CHAR : ~[\\]; // -// https://docs.python.org/3.13/reference/lexical_analysis.html#string-and-bytes-literals +// https://docs.python.org/3.14/reference/lexical_analysis.html#string-and-bytes-literals fragment BYTES_LITERAL : BYTES_PREFIX (SHORT_BYTES | LONG_BYTES); fragment BYTES_PREFIX options { caseInsensitive=true; } : 'b' | 'br' | 'rb'; // 'b' | 'B' | 'br' | 'Br' | 'bR' | 'BR' | 'rb' | 'rB' | 'Rb' | 'RB' @@ -344,57 +489,69 @@ fragment SHORT_DOUBLE_QUOTED_BYTES_CHAR // fragment BYTES_ESCAPE_SEQ : '\\' [\u0000-\u007F]; // "\" -// https://docs.python.org/3.13/reference/lexical_analysis.html#formatted-string-literals -// https://docs.python.org/3.13/library/string.html#format-specification-mini-language -// 'f' | 'F' | 'fr' | 'Fr' | 'fR' | 'FR' | 'rf' | 'rF' | 'Rf' | 'RF' -fragment FSTRING_PREFIX options { caseInsensitive=true; } : 'f' | 'fr' | 'rf'; +// https://docs.python.org/3.14/reference/lexical_analysis.html#formatted-string-literals +fragment FSTRING_PREFIX options { caseInsensitive=true; } : 'f' | 'fr' | 'rf'; // 'f' | 'F' | 'fr' | 'Fr' | 'fR' | 'FR' | 'rf' | 'rF' | 'Rf' | 'RF' +fragment TSTRING_PREFIX options { caseInsensitive=true; } : 't' | 'tr' | 'rt'; // 't' | 'T' | 'tr' | 'Tr' | 'tR' | 'TR' | 'rt' | 'rT' | 'Rt' | 'RT' +fragment STRING_QUOTES : ['] + | ["] + | ['][']['] + | ["]["]["] + ; -fragment SQ1__FSTRING_ITEM : (SQ1__FSTRING_PART+ TERMINATING_FSTRING_MIDDLE?) | TERMINATING_FSTRING_MIDDLE; -fragment DQ1__FSTRING_ITEM : (DQ1__FSTRING_PART+ TERMINATING_FSTRING_MIDDLE?) | TERMINATING_FSTRING_MIDDLE; -fragment SQ3__FSTRING_ITEM : (SQ3__FSTRING_PART+ TERMINATING_SQ3__FSTRING_MIDDLE?) | TERMINATING_SQ3__FSTRING_MIDDLE; -fragment DQ3__FSTRING_ITEM : (DQ3__FSTRING_PART+ TERMINATING_DQ3__FSTRING_MIDDLE?) 
| TERMINATING_DQ3__FSTRING_MIDDLE; +fragment SQ1__ISTRING_ITEM : (SQ1__ISTRING_PART+ TERMINATING_ISTRING_MIDDLE?) | TERMINATING_ISTRING_MIDDLE; +fragment DQ1__ISTRING_ITEM : (DQ1__ISTRING_PART+ TERMINATING_ISTRING_MIDDLE?) | TERMINATING_ISTRING_MIDDLE; +fragment SQ3__ISTRING_ITEM : (SQ3__ISTRING_PART+ TERMINATING_SQ3__ISTRING_MIDDLE?) | TERMINATING_SQ3__ISTRING_MIDDLE; +fragment DQ3__ISTRING_ITEM : (DQ3__ISTRING_PART+ TERMINATING_DQ3__ISTRING_MIDDLE?) | TERMINATING_DQ3__ISTRING_MIDDLE; -fragment SQ1R_FSTRING_ITEM : (SQ1R_FSTRING_PART+ TERMINATING_FSTRING_MIDDLE_RAW?) | TERMINATING_FSTRING_MIDDLE_RAW; -fragment DQ1R_FSTRING_ITEM : (DQ1R_FSTRING_PART+ TERMINATING_FSTRING_MIDDLE_RAW?) | TERMINATING_FSTRING_MIDDLE_RAW; -fragment SQ3R_FSTRING_ITEM : (SQ3R_FSTRING_PART+ TERMINATING_SQ3R_FSTRING_MIDDLE?) | TERMINATING_SQ3R_FSTRING_MIDDLE; -fragment DQ3R_FSTRING_ITEM : (DQ3R_FSTRING_PART+ TERMINATING_DQ3R_FSTRING_MIDDLE?) | TERMINATING_DQ3R_FSTRING_MIDDLE; +fragment SQ1R_ISTRING_ITEM : (SQ1R_ISTRING_PART+ TERMINATING_ISTRING_MIDDLE_RAW?) | TERMINATING_ISTRING_MIDDLE_RAW; +fragment DQ1R_ISTRING_ITEM : (DQ1R_ISTRING_PART+ TERMINATING_ISTRING_MIDDLE_RAW?) | TERMINATING_ISTRING_MIDDLE_RAW; +fragment SQ3R_ISTRING_ITEM : (SQ3R_ISTRING_PART+ TERMINATING_SQ3R_ISTRING_MIDDLE?) | TERMINATING_SQ3R_ISTRING_MIDDLE; +fragment DQ3R_ISTRING_ITEM : (DQ3R_ISTRING_PART+ TERMINATING_DQ3R_ISTRING_MIDDLE?) | TERMINATING_DQ3R_ISTRING_MIDDLE; -fragment SQ1__FSTRING_PART : SQ1_FSTRING_CHAR | FSTRING_ESCAPE_SEQ; -fragment DQ1__FSTRING_PART : DQ1_FSTRING_CHAR | FSTRING_ESCAPE_SEQ; -fragment SQ3__FSTRING_PART : ONE_OR_TWO_SQUOTE? (SQ3_FSTRING_CHAR | FSTRING_ESCAPE_SEQ); -fragment DQ3__FSTRING_PART : ONE_OR_TWO_DQUOTE? (DQ3_FSTRING_CHAR | FSTRING_ESCAPE_SEQ); -fragment SQ1R_FSTRING_PART : SQ1_FSTRING_CHAR | FSTRING_ESCAPE_SEQ_RAW; -fragment DQ1R_FSTRING_PART : DQ1_FSTRING_CHAR | FSTRING_ESCAPE_SEQ_RAW; -fragment SQ3R_FSTRING_PART : ONE_OR_TWO_SQUOTE? 
(SQ3_FSTRING_CHAR | FSTRING_ESCAPE_SEQ_RAW); -fragment DQ3R_FSTRING_PART : ONE_OR_TWO_DQUOTE? (DQ3_FSTRING_CHAR | FSTRING_ESCAPE_SEQ_RAW); -fragment SQ1_FSTRING_CHAR : ~[\\{}'\r\n]; // -fragment DQ1_FSTRING_CHAR : ~[\\{}"\r\n]; // -fragment SQ3_FSTRING_CHAR : ~[\\{}']; // -fragment DQ3_FSTRING_CHAR : ~[\\{}"]; // +fragment SQ1__ISTRING_PART : SQ1_ISTRING_CHAR | ISTRING_ESCAPE_SEQ; +fragment DQ1__ISTRING_PART : DQ1_ISTRING_CHAR | ISTRING_ESCAPE_SEQ; +fragment SQ3__ISTRING_PART : ONE_OR_TWO_SQUOTE? (SQ3_ISTRING_CHAR | ISTRING_ESCAPE_SEQ); +fragment DQ3__ISTRING_PART : ONE_OR_TWO_DQUOTE? (DQ3_ISTRING_CHAR | ISTRING_ESCAPE_SEQ); -fragment TERMINATING_SQ3__FSTRING_MIDDLE : ONE_OR_TWO_SQUOTE '{' | ONE_OR_TWO_SQUOTE? TERMINATING_FSTRING_MIDDLE; -fragment TERMINATING_DQ3__FSTRING_MIDDLE : ONE_OR_TWO_DQUOTE '{' | ONE_OR_TWO_DQUOTE? TERMINATING_FSTRING_MIDDLE; -fragment TERMINATING_SQ3R_FSTRING_MIDDLE : ONE_OR_TWO_SQUOTE '{' | ONE_OR_TWO_SQUOTE? TERMINATING_FSTRING_MIDDLE_RAW; -fragment TERMINATING_DQ3R_FSTRING_MIDDLE : ONE_OR_TWO_DQUOTE '{' | ONE_OR_TWO_DQUOTE? TERMINATING_FSTRING_MIDDLE_RAW; +fragment SQ1R_ISTRING_PART : SQ1_ISTRING_CHAR | ISTRING_ESCAPE_SEQ_RAW; +fragment DQ1R_ISTRING_PART : DQ1_ISTRING_CHAR | ISTRING_ESCAPE_SEQ_RAW; +fragment SQ3R_ISTRING_PART : ONE_OR_TWO_SQUOTE? (SQ3_ISTRING_CHAR | ISTRING_ESCAPE_SEQ_RAW); +fragment DQ3R_ISTRING_PART : ONE_OR_TWO_DQUOTE? (DQ3_ISTRING_CHAR | ISTRING_ESCAPE_SEQ_RAW); -fragment TERMINATING_FSTRING_MIDDLE : '\\'? DOUBLE_BRACE | '\\{' | ESCAPE_SEQ_NAMED_CHAR; -fragment TERMINATING_FSTRING_MIDDLE_RAW : '\\'? 
DOUBLE_BRACE | '\\{' ; // https://docs.python.org/3/faq/design.html#why-can-t-raw-strings-r-strings-end-with-a-backslash -fragment FSTRING_ESCAPE_SEQ : ESCAPE_SEQ_NEWLINE | '\\' ~[{}N]; // f"\\}" causes a lexer error -fragment FSTRING_ESCAPE_SEQ_RAW : ESCAPE_SEQ_NEWLINE | '\\' ~[{}]; // fr"\}" causes a lexer error + +fragment SQ1_ISTRING_CHAR : ~[\\{}'\r\n]; // +fragment DQ1_ISTRING_CHAR : ~[\\{}"\r\n]; // +fragment SQ3_ISTRING_CHAR : ~[\\{}']; // +fragment DQ3_ISTRING_CHAR : ~[\\{}"]; // + + + +fragment TERMINATING_SQ3__ISTRING_MIDDLE : ONE_OR_TWO_SQUOTE_LBRACE | ONE_OR_TWO_SQUOTE? TERMINATING_ISTRING_MIDDLE; +fragment TERMINATING_DQ3__ISTRING_MIDDLE : ONE_OR_TWO_DQUOTE_LBRACE | ONE_OR_TWO_DQUOTE? TERMINATING_ISTRING_MIDDLE; +fragment TERMINATING_SQ3R_ISTRING_MIDDLE : ONE_OR_TWO_SQUOTE_LBRACE | ONE_OR_TWO_SQUOTE? TERMINATING_ISTRING_MIDDLE_RAW; +fragment TERMINATING_DQ3R_ISTRING_MIDDLE : ONE_OR_TWO_DQUOTE_LBRACE | ONE_OR_TWO_DQUOTE? TERMINATING_ISTRING_MIDDLE_RAW; +fragment ONE_OR_TWO_SQUOTE_LBRACE : ONE_OR_TWO_SQUOTE '{'; +fragment ONE_OR_TWO_DQUOTE_LBRACE : ONE_OR_TWO_DQUOTE '{'; + +fragment TERMINATING_ISTRING_MIDDLE : TERMINATING_ISTRING_MIDDLE_RAW | ESCAPE_SEQ_NAMED_CHAR; +fragment TERMINATING_ISTRING_MIDDLE_RAW : '\\'? DOUBLE_BRACE | '\\{' ; // https://docs.python.org/3/faq/design.html#why-can-t-raw-strings-r-strings-end-with-a-backslash + +fragment ISTRING_ESCAPE_SEQ : ESCAPE_SEQ_NEWLINE | '\\' ~[{}N]; // f"\\}" causes a lexer error +fragment ISTRING_ESCAPE_SEQ_RAW : ESCAPE_SEQ_NEWLINE | '\\' ~[{}]; // fr"\}" causes a lexer error fragment ONE_OR_TWO_SQUOTE : ['][']?; fragment ONE_OR_TWO_DQUOTE : ["]["]?; -fragment DOUBLE_BRACE : '{{' | '}}'; // will be replaced to single brace in PythonLexerBase class - -fragment ESCAPE_SEQ_NAMED_CHAR : '\\N{' .*? 
'}'; // an escape sequence for a character by a name from the Unicode database -fragment ESCAPE_SEQ_NEWLINE : BACKSLASH_NEWLINE; // it is a kind of line continuation for string literals (backslash and newline will be ignored) +fragment DOUBLE_BRACE : '{{' | '}}'; // PythonLexerBase replaces double brace with single brace +fragment ESCAPE_SEQ_NAMED_CHAR : '\\N{' .*? '}'; // an escape sequence for a Unicode character specified by name +fragment ESCAPE_SEQ_NEWLINE : BACKSLASH_NEWLINE; // this escape sequence acts as a line continuation in string literals + // the backslash and newline are ignored by the Python interpreter fragment BACKSLASH_NEWLINE : '\\' NEWLINE; -// https://docs.python.org/3.13/reference/lexical_analysis.html#integer-literals +// https://docs.python.org/3.14/reference/lexical_analysis.html#integer-literals fragment INTEGER : DEC_INTEGER | BIN_INTEGER | OCT_INTEGER | HEX_INTEGER; fragment DEC_INTEGER : NON_ZERO_DIGIT ('_'? DIGIT)* | '0'+ ('_'? '0')*; fragment BIN_INTEGER : '0' ('b' | 'B') ('_'? BIN_DIGIT)+; @@ -406,7 +563,7 @@ fragment BIN_DIGIT : '0' | '1'; fragment OCT_DIGIT : [0-7]; fragment HEX_DIGIT : DIGIT | [a-f] | [A-F]; -// https://docs.python.org/3.13/reference/lexical_analysis.html#floating-point-literals +// https://docs.python.org/3.14/reference/lexical_analysis.html#floating-point-literals fragment FLOAT_NUMBER : POINT_FLOAT | EXPONENT_FLOAT; fragment POINT_FLOAT : DIGIT_PART? FRACTION | DIGIT_PART '.'; fragment EXPONENT_FLOAT : (DIGIT_PART | POINT_FLOAT) EXPONENT; @@ -414,11 +571,11 @@ fragment DIGIT_PART : DIGIT ('_'? DIGIT)*; fragment FRACTION : '.' DIGIT_PART; fragment EXPONENT : ('e' | 'E') ('+' | '-')? 
DIGIT_PART; -// https://docs.python.org/3.13/reference/lexical_analysis.html#imaginary-literals +// https://docs.python.org/3.14/reference/lexical_analysis.html#imaginary-literals fragment IMAG_NUMBER : (FLOAT_NUMBER | DIGIT_PART) ('j' | 'J'); -// https://github.com/RobEin/ANTLR4-parser-for-Python-3.13/tree/main/valid_chars_in_py_identifiers -fragment ID_CONTINUE +// https://github.com/RobEin/ANTLR4-parser-for-Python-3.14/tree/main/utils/valid_chars_in_py_identifiers +fragment ID_CONTINUE // for Python 3.14.2 : ID_START | '\u{0030}' .. '\u{0039}' | '\u{00B7}' @@ -449,7 +606,7 @@ fragment ID_CONTINUE | '\u{0825}' .. '\u{0827}' | '\u{0829}' .. '\u{082D}' | '\u{0859}' .. '\u{085B}' - | '\u{0898}' .. '\u{089F}' + | '\u{0897}' .. '\u{089F}' | '\u{08CA}' .. '\u{08E1}' | '\u{08E3}' .. '\u{0903}' | '\u{093A}' .. '\u{093C}' @@ -666,8 +823,10 @@ fragment ID_CONTINUE | '\u{10AE5}' .. '\u{10AE6}' | '\u{10D24}' .. '\u{10D27}' | '\u{10D30}' .. '\u{10D39}' + | '\u{10D40}' .. '\u{10D49}' + | '\u{10D69}' .. '\u{10D6D}' | '\u{10EAB}' .. '\u{10EAC}' - | '\u{10EFD}' .. '\u{10EFF}' + | '\u{10EFC}' .. '\u{10EFF}' | '\u{10F46}' .. '\u{10F50}' | '\u{10F82}' .. '\u{10F85}' | '\u{11000}' .. '\u{11002}' @@ -701,6 +860,13 @@ fragment ID_CONTINUE | '\u{11362}' .. '\u{11363}' | '\u{11366}' .. '\u{1136C}' | '\u{11370}' .. '\u{11374}' + | '\u{113B8}' .. '\u{113C0}' + | '\u{113C2}' + | '\u{113C5}' + | '\u{113C7}' .. '\u{113CA}' + | '\u{113CC}' .. '\u{113D0}' + | '\u{113D2}' + | '\u{113E1}' .. '\u{113E2}' | '\u{11435}' .. '\u{11446}' | '\u{11450}' .. '\u{11459}' | '\u{1145E}' @@ -713,6 +879,7 @@ fragment ID_CONTINUE | '\u{11650}' .. '\u{11659}' | '\u{116AB}' .. '\u{116B7}' | '\u{116C0}' .. '\u{116C9}' + | '\u{116D0}' .. '\u{116E3}' | '\u{1171D}' .. '\u{1172B}' | '\u{11730}' .. '\u{11739}' | '\u{1182C}' .. '\u{1183A}' @@ -732,6 +899,7 @@ fragment ID_CONTINUE | '\u{11A47}' | '\u{11A51}' .. '\u{11A5B}' | '\u{11A8A}' .. '\u{11A99}' + | '\u{11BF0}' .. '\u{11BF9}' | '\u{11C2F}' .. 
'\u{11C36}' | '\u{11C38}' .. '\u{11C3F}' | '\u{11C50}' .. '\u{11C59}' @@ -752,20 +920,23 @@ fragment ID_CONTINUE | '\u{11F03}' | '\u{11F34}' .. '\u{11F3A}' | '\u{11F3E}' .. '\u{11F42}' - | '\u{11F50}' .. '\u{11F59}' + | '\u{11F50}' .. '\u{11F5A}' | '\u{13440}' | '\u{13447}' .. '\u{13455}' + | '\u{1611E}' .. '\u{16139}' | '\u{16A60}' .. '\u{16A69}' | '\u{16AC0}' .. '\u{16AC9}' | '\u{16AF0}' .. '\u{16AF4}' | '\u{16B30}' .. '\u{16B36}' | '\u{16B50}' .. '\u{16B59}' + | '\u{16D70}' .. '\u{16D79}' | '\u{16F4F}' | '\u{16F51}' .. '\u{16F87}' | '\u{16F8F}' .. '\u{16F92}' | '\u{16FE4}' | '\u{16FF0}' .. '\u{16FF1}' | '\u{1BC9D}' .. '\u{1BC9E}' + | '\u{1CCF0}' .. '\u{1CCF9}' | '\u{1CF00}' .. '\u{1CF2D}' | '\u{1CF30}' .. '\u{1CF46}' | '\u{1D165}' .. '\u{1D169}' @@ -792,6 +963,8 @@ fragment ID_CONTINUE | '\u{1E2AE}' | '\u{1E2EC}' .. '\u{1E2F9}' | '\u{1E4EC}' .. '\u{1E4F9}' + | '\u{1E5EE}' .. '\u{1E5EF}' + | '\u{1E5F1}' .. '\u{1E5FA}' | '\u{1E8D0}' .. '\u{1E8D6}' | '\u{1E944}' .. '\u{1E94A}' | '\u{1E950}' .. '\u{1E959}' @@ -799,7 +972,7 @@ fragment ID_CONTINUE | '\u{E0100}' .. '\u{E01EF}' ; -fragment ID_START +fragment ID_START // for Python 3.14.2 : '\u{0041}' .. '\u{005A}' | '\u{005F}' | '\u{0061}' .. '\u{007A}' @@ -1025,7 +1198,7 @@ fragment ID_START | '\u{1C00}' .. '\u{1C23}' | '\u{1C4D}' .. '\u{1C4F}' | '\u{1C5A}' .. '\u{1C7D}' - | '\u{1C80}' .. '\u{1C88}' + | '\u{1C80}' .. '\u{1C8A}' | '\u{1C90}' .. '\u{1CBA}' | '\u{1CBD}' .. '\u{1CBF}' | '\u{1CE9}' .. '\u{1CEC}' @@ -1108,10 +1281,10 @@ fragment ID_START | '\u{A6A0}' .. '\u{A6EF}' | '\u{A717}' .. '\u{A71F}' | '\u{A722}' .. '\u{A788}' - | '\u{A78B}' .. '\u{A7CA}' + | '\u{A78B}' .. '\u{A7CD}' | '\u{A7D0}' .. '\u{A7D1}' | '\u{A7D3}' - | '\u{A7D5}' .. '\u{A7D9}' + | '\u{A7D5}' .. '\u{A7DC}' | '\u{A7F2}' .. '\u{A801}' | '\u{A803}' .. '\u{A805}' | '\u{A807}' .. '\u{A80A}' @@ -1216,6 +1389,7 @@ fragment ID_START | '\u{105A3}' .. '\u{105B1}' | '\u{105B3}' .. '\u{105B9}' | '\u{105BB}' .. '\u{105BC}' + | '\u{105C0}' .. 
'\u{105F3}' | '\u{10600}' .. '\u{10736}' | '\u{10740}' .. '\u{10755}' | '\u{10760}' .. '\u{10767}' @@ -1252,8 +1426,11 @@ fragment ID_START | '\u{10C80}' .. '\u{10CB2}' | '\u{10CC0}' .. '\u{10CF2}' | '\u{10D00}' .. '\u{10D23}' + | '\u{10D4A}' .. '\u{10D65}' + | '\u{10D6F}' .. '\u{10D85}' | '\u{10E80}' .. '\u{10EA9}' | '\u{10EB0}' .. '\u{10EB1}' + | '\u{10EC2}' .. '\u{10EC4}' | '\u{10F00}' .. '\u{10F1C}' | '\u{10F27}' | '\u{10F30}' .. '\u{10F45}' @@ -1292,6 +1469,13 @@ fragment ID_START | '\u{1133D}' | '\u{11350}' | '\u{1135D}' .. '\u{11361}' + | '\u{11380}' .. '\u{11389}' + | '\u{1138B}' + | '\u{1138E}' + | '\u{11390}' .. '\u{113B5}' + | '\u{113B7}' + | '\u{113D1}' + | '\u{113D3}' | '\u{11400}' .. '\u{11434}' | '\u{11447}' .. '\u{1144A}' | '\u{1145F}' .. '\u{11461}' @@ -1326,6 +1510,7 @@ fragment ID_START | '\u{11A5C}' .. '\u{11A89}' | '\u{11A9D}' | '\u{11AB0}' .. '\u{11AF8}' + | '\u{11BC0}' .. '\u{11BE0}' | '\u{11C00}' .. '\u{11C08}' | '\u{11C0A}' .. '\u{11C2E}' | '\u{11C40}' @@ -1349,7 +1534,9 @@ fragment ID_START | '\u{12F90}' .. '\u{12FF0}' | '\u{13000}' .. '\u{1342F}' | '\u{13441}' .. '\u{13446}' + | '\u{13460}' .. '\u{143FA}' | '\u{14400}' .. '\u{14646}' + | '\u{16100}' .. '\u{1611D}' | '\u{16800}' .. '\u{16A38}' | '\u{16A40}' .. '\u{16A5E}' | '\u{16A70}' .. '\u{16ABE}' @@ -1358,6 +1545,7 @@ fragment ID_START | '\u{16B40}' .. '\u{16B43}' | '\u{16B63}' .. '\u{16B77}' | '\u{16B7D}' .. '\u{16B8F}' + | '\u{16D40}' .. '\u{16D6C}' | '\u{16E40}' .. '\u{16E7F}' | '\u{16F00}' .. '\u{16F4A}' | '\u{16F50}' @@ -1366,7 +1554,7 @@ fragment ID_START | '\u{16FE3}' | '\u{17000}' .. '\u{187F7}' | '\u{18800}' .. '\u{18CD5}' - | '\u{18D00}' .. '\u{18D08}' + | '\u{18CFF}' .. '\u{18D08}' | '\u{1AFF0}' .. '\u{1AFF3}' | '\u{1AFF5}' .. '\u{1AFFB}' | '\u{1AFFD}' .. '\u{1AFFE}' @@ -1419,6 +1607,8 @@ fragment ID_START | '\u{1E290}' .. '\u{1E2AD}' | '\u{1E2C0}' .. '\u{1E2EB}' | '\u{1E4D0}' .. '\u{1E4EB}' + | '\u{1E5D0}' .. '\u{1E5ED}' + | '\u{1E5F0}' | '\u{1E7E0}' .. 
'\u{1E7E6}' | '\u{1E7E8}' .. '\u{1E7EB}' | '\u{1E7ED}' .. '\u{1E7EE}' @@ -1468,4 +1658,4 @@ fragment ID_START | '\u{2F800}' .. '\u{2FA1D}' | '\u{30000}' .. '\u{3134A}' | '\u{31350}' .. '\u{323AF}' - ; + ; \ No newline at end of file diff --git a/python/python3_13/PythonParser.g4 b/python/python3_14/PythonParser.g4 similarity index 93% rename from python/python3_13/PythonParser.g4 rename to python/python3_14/PythonParser.g4 index 4a1059a638..a8b2f3e3a9 100644 --- a/python/python3_13/PythonParser.g4 +++ b/python/python3_14/PythonParser.g4 @@ -21,17 +21,13 @@ THE SOFTWARE. */ /* - * Project : an ANTLR4 parser grammar by the official PEG grammar - * https://github.com/RobEin/ANTLR4-parser-for-Python-3.13 + * Project : an ANTLR4 parser grammar for Python 3 programming language based on the official PEG grammar + * https://github.com/RobEin/ANTLR4-parser-for-Python-3.14 * Developed by : Robert Einhorn * */ - /* - * Contributors : [Willie Shen](https://github.com/Willie169) - */ - -// Python 3.13.2 https://docs.python.org/3.13/reference/grammar.html#full-grammar-specification +// Python 3.14.2 https://docs.python.org/3.14/reference/grammar.html#full-grammar-specification parser grammar PythonParser; @@ -50,10 +46,15 @@ func_type: '(' type_expressions? ')' '->' expression NEWLINE* EOF; statements: statement+; -statement: compound_stmt | simple_stmts; +statement + : compound_stmt + | simple_stmts; + +single_compound_stmt + : compound_stmt; statement_newline - : compound_stmt NEWLINE + : single_compound_stmt NEWLINE | simple_stmts | NEWLINE | EOF; @@ -71,12 +72,12 @@ simple_stmt | return_stmt | import_stmt | raise_stmt - | 'pass' + | pass_stmt | del_stmt | yield_stmt | assert_stmt - | 'break' - | 'continue' + | break_stmt + | continue_stmt | global_stmt | nonlocal_stmt; @@ -98,8 +99,8 @@ assignment : name ':' expression ('=' annotated_rhs )? | ('(' single_target ')' | single_subscript_attribute_target) ':' expression ('=' annotated_rhs )? 
- | (star_targets '=' )+ (yield_expr | star_expressions) TYPE_COMMENT? - | single_target augassign (yield_expr | star_expressions); + | (star_targets '=' )+ annotated_rhs TYPE_COMMENT? + | single_target augassign annotated_rhs; annotated_rhs: yield_expr | star_expressions; @@ -125,6 +126,15 @@ raise_stmt : 'raise' (expression ('from' expression )?)? ; +pass_stmt + : 'pass'; + +break_stmt + : 'break'; + +continue_stmt + : 'continue'; + global_stmt: 'global' name (',' name)*; nonlocal_stmt: 'nonlocal' name (',' name)*; @@ -156,10 +166,12 @@ import_from_as_names : import_from_as_name (',' import_from_as_name)*; import_from_as_name : name ('as' name )?; + dotted_as_names : dotted_as_name (',' dotted_as_name)*; dotted_as_name : dotted_name ('as' name )?; + dotted_name : dotted_name '.' name | name; @@ -311,10 +323,14 @@ try_stmt // ---------------- except_block - : 'except' (expression ('as' name )?)? ':' block + : 'except' (expression ('as' name )? | expressions)? ':' block ; + + except_star_block - : 'except' '*' expression ('as' name )? ':' block; + : 'except' '*' (expression ('as' name )? | expressions) ':' block + ; + finally_block : 'finally' ':' block; @@ -472,7 +488,8 @@ type_alias // Type parameter declaration // -------------------------- -type_params: '[' type_param_seq ']'; +type_params + : '[' type_param_seq ']'; type_param_seq: type_param (',' type_param)* ','?; @@ -481,8 +498,6 @@ type_param | '*' name type_param_starred_default? | '**' name type_param_default? ; - - type_param_bound: ':' expression; type_param_default: '=' expression; type_param_starred_default: '=' star_expression; @@ -719,8 +734,25 @@ fstring_format_spec fstring : FSTRING_START fstring_middle* FSTRING_END; + + +tstring_format_spec + : TSTRING_MIDDLE + | tstring_replacement_field; +tstring_full_format_spec + : ':' tstring_format_spec*; +tstring_replacement_field + : LBRACE annotated_rhs '='? fstring_conversion? tstring_full_format_spec? 
RBRACE; +tstring_middle + : tstring_replacement_field + | TSTRING_MIDDLE; +tstring + : TSTRING_START tstring_middle* TSTRING_END; + string: STRING; -strings: (fstring|string)+; +strings + : (fstring|string)+ + |tstring+; list : '[' star_named_expressions? ']'; @@ -875,7 +907,7 @@ func_type_comment : NEWLINE TYPE_COMMENT // Must be followed by indented block | TYPE_COMMENT; -// *** related to soft keywords: https://docs.python.org/3.13/reference/lexical_analysis.html#soft-keywords +// *** related to soft keywords: https://docs.python.org/3.14/reference/lexical_analysis.html#soft-keywords name_except_underscore : NAME // ***** The NAME token can be used only in this rule ***** | NAME_OR_TYPE diff --git a/python/python3_14/README.md b/python/python3_14/README.md new file mode 100644 index 0000000000..b5d4f824b0 --- /dev/null +++ b/python/python3_14/README.md @@ -0,0 +1,23 @@ +# Python 3.14.2 parser + +### About files: +- PythonParser.g4 is the ANTLR4 parser grammar that based on the official [Python PEG grammar](https://docs.python.org/3.14/reference/grammar.html) + +- PythonLexerBase class + - handles the Python indentations + - creates encoding token + - tokenizes fstring literals + - and manage many other things + +- Example files from: [Python 3.14 Standard Lib](https://github.com/python/cpython/tree/3.14/Lib)

+ +### Recent changes: +- parser grammar update for Python 3.14.2 +- tokenizing t-string literals +- tokenizing BOM Unicode character at the start of the file so it is skipped in the token stream +- moved encoding detection from PythonLexerBase to a separate component + +#### [Previous changes](https://github.com/antlr/grammars-v4/blob/master/python/python3_14/changes.md)

+ +### Related link: +[ANTLR4-parser-for-Python-3.14](https://github.com/RobEin/ANTLR4-parser-for-Python-3.14) \ No newline at end of file diff --git a/python/python3_14/TypeScript/PythonLexerBase.ts b/python/python3_14/TypeScript/PythonLexerBase.ts new file mode 100644 index 0000000000..6b112b3409 --- /dev/null +++ b/python/python3_14/TypeScript/PythonLexerBase.ts @@ -0,0 +1,779 @@ +/* +The MIT License (MIT) +Copyright (c) 2021 Robert Einhorn + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + */ + +/* + * + * Project : A helper class for an ANTLR4 Python lexer grammar that assists in tokenizing indentation, + * interpolated strings, and encoding declaration. 
+ * + * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com + * + */ + +import { CharStream, CharStreams, CommonTokenStream, Token, CommonToken, Lexer } from "antlr4"; +import { TokenSource } from "antlr4/src/antlr4/TokenSource.js"; +import PythonLexer from "./PythonLexer.js"; +import PythonParser from "./PythonParser.js"; +import * as Collections from "typescript-collections"; + +export default abstract class PythonLexerBase extends Lexer { + private static readonly LEXER_MODES_FOR_ISTRING_START: Map = new Map(); + private static readonly INVALID_LENGTH: number = -1; + private static readonly ERR_TXT: string = " ERROR: "; + private static readonly TAB_LENGTH: number = 8; + + private encodingName!: string; + + // Indentation handling + private indentLengthStack!: Collections.Stack; + private pendingTokens!: Array; + + private previousPendingTokenType!: number; + private lastPendingTokenTypeFromDefaultChannel!: number; + + // Parenthesis / bracket / brace counts + private opened!: number; + private paren_or_bracket_openedStack!: Array; + private braceExpressionStack!: Array; + private prevBraceExpression!: string; + + // Current interpolated STRING_MIDDLE token type (FSTRING_MIDDLE or TSTRING_MIDDLE) + private curISTRING_MIDDLEtokenType!: number; + + // We reimplement mode/stack because not all runtimes expose _mode/_modeStack + private curLexerMode!: number; + private lexerModeStack!: Array; + + // Indentation diagnostics + private wasSpaceIndentation!: boolean; + private wasTabIndentation!: boolean; + private wasIndentationMixedWithSpacesAndTabs!: boolean; + + // Current / lookahead tokens + private curToken: Token | undefined; + private ffgToken: Token | undefined; + + protected constructor(input: CharStream) { + super(input); + this.init(); + } + + public reset(): void { + this.init(); + super.reset(); + } + + private init(): void { + this.encodingName = ""; + this.indentLengthStack = new Collections.Stack(); + this.pendingTokens = []; + 
this.previousPendingTokenType = 0; + this.lastPendingTokenTypeFromDefaultChannel = 0; + this.opened = 0; + this.paren_or_bracket_openedStack = []; + this.braceExpressionStack = []; + this.prevBraceExpression = ""; + this.curISTRING_MIDDLEtokenType = 0; + this.curLexerMode = Lexer.DEFAULT_MODE; + this.lexerModeStack = []; + this.wasSpaceIndentation = false; + this.wasTabIndentation = false; + this.wasIndentationMixedWithSpacesAndTabs = false; + this.curToken = undefined; + this.ffgToken = undefined; + } + + /** + * Sets the encoding name to emit an ENCODING token at the start of the token stream. + * Leave empty if not needed (e.g., when parsing from string). + * + * @param encodingName - The encoding name (e.g., "utf-8"), or empty string to disable ENCODING token. + */ + public setEncodingName(encodingName: string): void { + this.encodingName = encodingName; + } + + public nextToken(): Token { // Reading the input stream until EOF is reached + this.checkNextToken(); + return this.pendingTokens.shift()!; /* .pollFirst() */; // Add the queued token to the token stream + } + + private checkNextToken(): void { + if (this.previousPendingTokenType == PythonLexer.EOF) + return; + + this.setCurrentAndFollowingTokens(); + if (this.indentLengthStack.isEmpty()) { // We're at the first token + this.handleStartOfInput(); + } + + switch (this.curToken!.type) { + case PythonLexer.NEWLINE: + this.handleNEWLINEtoken(); + break; + case PythonLexer.LPAR: + case PythonLexer.LSQB: + case PythonLexer.LBRACE: + this.opened++; + this.addPendingToken(this.curToken!); + break; + case PythonLexer.RPAR: + case PythonLexer.RSQB: + case PythonLexer.RBRACE: + this.opened--; + this.addPendingToken(this.curToken!); + break; + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + this.handleISTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field + this.addPendingToken(this.curToken!); + break; + case PythonLexer.COLONEQUAL: + this.handleCOLONEQUALtokenInIString(); 
+ break; + case PythonLexer.ERRORTOKEN: + this.reportLexerError(`token recognition error at: '${this.curToken!.text}'`); + this.addPendingToken(this.curToken!); + break; + case PythonLexer.EOF: + this.handleEOFtoken(); + break; + default: + this.addPendingToken(this.curToken!); + } + this.handleFORMAT_SPECIFICATION_MODE(); + } + + private setCurrentAndFollowingTokens(): void { + this.curToken = this.ffgToken == undefined + ? super.nextToken() + : this.ffgToken; + + this.checkCurToken(); // Do not use ffgToken in this method or any of its submethods — it hasn't been set yet! + + this.ffgToken = this.curToken!.type === PythonLexer.EOF + ? this.curToken + : super.nextToken(); + } + + // - initialize indent stack + // - skip BOM token + // - insert ENCODING token (if any) + // - hide leading NEWLINE(s) + // - insert leading INDENT if first statement is indented + private handleStartOfInput(): void { + // initialize the stack with a default 0 indentation length + this.indentLengthStack.push(0); // this will never be popped off + + if (this.curToken!.type === PythonLexer.BOM) { + this.setCurrentAndFollowingTokens(); + } + this.insertENCODINGtoken(); + + while (this.curToken!.type !== PythonLexer.EOF) { + if (this.curToken!.channel === Token.DEFAULT_CHANNEL) { + if (this.curToken!.type === PythonLexer.NEWLINE) { + // all the NEWLINE tokens must be ignored before the first statement + this.hideAndAddPendingToken(this.curToken!); + } else { // We're at the first statement + this.insertLeadingIndentToken(); + return; // continue the processing of the current token with checkNextToken() + } + } else { + this.addPendingToken(this.curToken!); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token + } + this.setCurrentAndFollowingTokens(); + } // continue the processing of the EOF token with checkNextToken() + } + + private insertENCODINGtoken(): void { // https://peps.python.org/pep-0263/ + if (this.encodingName === '') return; + + const sourcePair = [this as unknown as 
TokenSource, this._input] as [TokenSource, CharStream]; + const encodingToken: CommonToken = new CommonToken(sourcePair, PythonLexer.ENCODING, Token.HIDDEN_CHANNEL, /*start*/ 0, /*stop*/ 0); + encodingToken.text = this.encodingName; + encodingToken.line = 0; + encodingToken.column = -1; + this.addPendingToken(encodingToken); + } + + private insertLeadingIndentToken(): void { + if (this.previousPendingTokenType === PythonLexer.WS) { + const prevToken: Token = this.pendingTokens.at(-1)!; /* stack peek */ // WS token + if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement + const errMsg: string = "first statement indented"; + this.reportLexerError(errMsg); + // insert an INDENT token before the first statement to trigger an 'unexpected indent' error later in the parser + this.createAndAddPendingToken(PythonLexer.INDENT, PythonLexerBase.ERR_TXT + errMsg, this.curToken!); + } + } + } + + private handleNEWLINEtoken(): void { + if (this.lexerModeStack.length > 0) { // for multi line f/t-string literals + this.addPendingToken(this.curToken!); + return; + } + + if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token + this.hideAndAddPendingToken(this.curToken!); + return; + } + + const nlToken: Token = this.curToken!.clone(); // save the current NEWLINE token + const isLookingAhead: boolean = this.ffgToken!.type === PythonLexer.WS; + if (isLookingAhead) { + this.setCurrentAndFollowingTokens(); // set the next two tokens + } + + switch (this.ffgToken!.type) { + case PythonLexer.NEWLINE: // We're before a blank line + case PythonLexer.COMMENT: // We're before a comment + this.hideAndAddPendingToken(nlToken); + if (isLookingAhead) { + this.addPendingToken(this.curToken!); // WS token + } + break; + default: + this.addPendingToken(nlToken); + if (isLookingAhead) { // We're on whitespace(s) followed by a statement + const indentationLength: number = this.ffgToken!.type === 
PythonLexer.EOF ? + 0 : + this.getIndentationLength(this.curToken!.text); + + if (indentationLength !== PythonLexerBase.INVALID_LENGTH) { + this.addPendingToken(this.curToken!); // WS token + this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) + } else { + this.reportError("inconsistent use of tabs and spaces in indentation"); + } + } else { // We're at a newline followed by a statement (there is no whitespace before the statement) + this.insertIndentOrDedentToken(0); // may insert DEDENT token(s) + } + } + } + + private insertIndentOrDedentToken(indentLength: number): void { + let prevIndentLength: number = this.indentLengthStack.peek()!; + if (indentLength > prevIndentLength) { + this.createAndAddPendingToken(PythonLexer.INDENT, null, this.ffgToken!); + this.indentLengthStack.push(indentLength); + return; + } + + while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream + this.indentLengthStack.pop(); + prevIndentLength = this.indentLengthStack.peek()!; + if (indentLength <= prevIndentLength) { + this.createAndAddPendingToken(PythonLexer.DEDENT, null, this.ffgToken!); + } else { + this.reportError("inconsistent dedent"); + } + } + } + + private checkCurToken(): void { + switch (this.curToken!.type) { + case PythonLexer.FSTRING_START: + this.curISTRING_MIDDLEtokenType = PythonLexer.FSTRING_MIDDLE; + this.setLexerModeByISTRING_STARTtoken(); + return; + case PythonLexer.TSTRING_START: + this.curISTRING_MIDDLEtokenType = PythonLexer.TSTRING_MIDDLE; + this.setLexerModeByISTRING_STARTtoken(); + return; + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + this.handleISTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field + switch (this.curToken!.type) { + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: + return; // No curToken exchange happened + } + break; + case PythonLexer.FSTRING_END: + case PythonLexer.TSTRING_END: + 
this.popLexerMode(); + return; + default: + if (this.lexerModeStack.length === 0) { + return; // Not in f/t-string mode + } + } + this.processBraceExpression(); + } + + private processBraceExpression(): void { + switch (this.curToken!.type) { // the following tokens can only come from default mode (after an LBRACE in f/t-string) + case PythonLexer.NEWLINE: + // append the current brace expression with the current newline + this.appendToBraceExpression(this.curToken!.text); + this.curToken!.channel = Token.HIDDEN_CHANNEL; + break; + case PythonLexer.LBRACE: + // the outermost brace expression cannot be a dictionary comprehension or a set comprehension + this.braceExpressionStack.push("{"); + this.paren_or_bracket_openedStack.push(0); + this.pushLexerMode(Lexer.DEFAULT_MODE); + break; + case PythonLexer.LPAR: + case PythonLexer.LSQB: + // append the current brace expression with a "(" or a "[" + this.appendToBraceExpression(this.curToken!.text); + // https://peps.python.org/pep-0498/#lambdas-inside-expressions + this.incrementBraceStack(); + break; + case PythonLexer.RPAR: + case PythonLexer.RSQB: + // append the current brace expression with a ")" or a "]" + this.appendToBraceExpression(this.curToken!.text); + this.decrementBraceStack(); + break; + case PythonLexer.COLON: + case PythonLexer.COLONEQUAL: + // append the current brace expression with a ":" or a ":=" + this.appendToBraceExpression(this.curToken!.text); + this.setLexerModeByCOLONorCOLONEQUALtoken(); + break; + case PythonLexer.RBRACE: + this.setLexerModeAfterRBRACEtoken(); + break; + default: + // append the current brace expression with the current token text + this.appendToBraceExpression(this.curToken!.text); + } + } + + private appendToBraceExpression(text: string): void { + const lastIndex: number = this.braceExpressionStack.length - 1; + this.braceExpressionStack[lastIndex] += text; + } + + private incrementBraceStack(): void { // increment the last element (stack peek + 1) + const lastIndex: 
number = this.paren_or_bracket_openedStack.length - 1; + this.paren_or_bracket_openedStack[lastIndex]!++; + } + + private decrementBraceStack(): void { // decrement the last element (stack peek - 1) + const lastIndex: number = this.paren_or_bracket_openedStack.length - 1; + this.paren_or_bracket_openedStack[lastIndex]!--; + } + + private setLexerModeAfterRBRACEtoken(): void { + switch (this.curLexerMode) { + case Lexer.DEFAULT_MODE: + this.popLexerMode(); + this.popByBRACE(); + break; + case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.popLexerMode(); + this.popLexerMode(); + this.popByBRACE(); + break; + default: + this.reportLexerError("f-string: single '}' is not allowed"); + } + } + + private setLexerModeByISTRING_STARTtoken(): void { // ISTRING = interpolated string (FSTRING or TSTRING) + if (PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.size === 0) { + PythonLexerBase.initLexerModesForIStringStart(); + } + + const interpolatedStringPrefix: string = this.curToken!.text.toLowerCase(); + if (PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.has(interpolatedStringPrefix)) { + const 
newLexerMode: number = PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.get(interpolatedStringPrefix)!; + this.pushLexerMode(newLexerMode); + } else { + this.reportLexerError( + "internal error: unknown interpolated string literal prefix: " + this.curToken!.text + ); + } + } + + private static initLexerModesForIStringStart(): void { + // f-strings + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("f'", PythonLexer.SQ1__FSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rf'", PythonLexer.SQ1R_FSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("fr'", PythonLexer.SQ1R_FSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("f\"", PythonLexer.DQ1__FSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rf\"", PythonLexer.DQ1R_FSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("fr\"", PythonLexer.DQ1R_FSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("f'''", PythonLexer.SQ3__FSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rf'''", PythonLexer.SQ3R_FSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("fr'''", PythonLexer.SQ3R_FSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("f\"\"\"", PythonLexer.DQ3__FSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rf\"\"\"", PythonLexer.DQ3R_FSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("fr\"\"\"", PythonLexer.DQ3R_FSTRING_MODE); + + // t-strings + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("t'", PythonLexer.SQ1__TSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rt'", PythonLexer.SQ1R_TSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("tr'", PythonLexer.SQ1R_TSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("t\"", PythonLexer.DQ1__TSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rt\"", PythonLexer.DQ1R_TSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("tr\"", 
PythonLexer.DQ1R_TSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("t'''", PythonLexer.SQ3__TSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rt'''", PythonLexer.SQ3R_TSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("tr'''", PythonLexer.SQ3R_TSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("t\"\"\"", PythonLexer.DQ3__TSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rt\"\"\"", PythonLexer.DQ3R_TSTRING_MODE); + PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("tr\"\"\"", PythonLexer.DQ3R_TSTRING_MODE); + } + + private setLexerModeByCOLONorCOLONEQUALtoken(): void { + // Exit early when the current lexer mode indicates an open parenthesis/bracket + const opened: boolean = this.paren_or_bracket_openedStack.at(-1)! > 0; /* stack peek */ + if (opened) { + return; + } + + // COLONEQUAL token will be replaced with a COLON token in CheckNextToken() + const prevLexerMode: number = this.lexerModeStack.at(-1)!; /* stack peek */ + switch (prevLexerMode) { + case PythonLexer.SQ1__FSTRING_MODE: + case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ1__TSTRING_MODE: + case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ1R_FSTRING_MODE: + case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ1R_TSTRING_MODE: + case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ1__FSTRING_MODE: + case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ1__TSTRING_MODE: + 
case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ1R_FSTRING_MODE: + case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ1R_TSTRING_MODE: + case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ3__FSTRING_MODE: + case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ3__TSTRING_MODE: + case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ3R_FSTRING_MODE: + case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.SQ3R_TSTRING_MODE: + case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ3__FSTRING_MODE: + case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ3__TSTRING_MODE: + case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ3R_FSTRING_MODE: + case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); + break; + + case PythonLexer.DQ3R_TSTRING_MODE: + case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE: + this.pushLexerMode(PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); + break; + } + } + + private popByBRACE(): void 
{ + this.paren_or_bracket_openedStack.pop(); + const curBraceExpression: string = this.braceExpressionStack.pop()!; + this.prevBraceExpression = curBraceExpression + "}"; + if (this.braceExpressionStack.length > 0) { + // Extend the current brace expression by adding the previous expression + const lastIndex: number = this.braceExpressionStack.length - 1; + this.braceExpressionStack[lastIndex] += this.prevBraceExpression; + } + } + + private handleISTRING_MIDDLEtokenWithDoubleBrace(): void { // ISTRING = interpolated string (FSTRING or TSTRING) + // Replace the trailing double brace with a single brace and insert a hidden brace token + const lastTwoChars: string = this.getLastTwoCharsOfTheCurTokenText(); + switch (lastTwoChars) { + case "{{": + this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL); + break; + case "}}": + this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL); + break; + } + } + + private handleISTRING_MIDDLEtokenWithQuoteAndLBrace(): void { // ISTRING = interpolated string (FSTRING or TSTRING) + // Replace the trailing quote + left_brace with a quote and insert an LBRACE token + // Replace the trailing backslash + left_brace with a backslash and insert an LBRACE token + const lastTwoChars: string = this.getLastTwoCharsOfTheCurTokenText(); + switch (lastTwoChars) { + case "\"{": + case "'{": + case "\\{": + this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL); + break; + } + } + + private getLastTwoCharsOfTheCurTokenText(): string { + const text = this.curToken!.text; + return text.length <= 2 ? 
text : text.slice(-2); + } + + private trimLastCharAddPendingTokenSetCurToken(type: number, text: string, channel: number): void { + // Trim the last char and add the modified curToken to the pendingTokens stack + const tokenTextWithoutLastChar: string = this.curToken!.text.slice(0, -1); + this.curToken!.text = tokenTextWithoutLastChar; + this.curToken!.stop -= 1; + this.addPendingToken(this.curToken!); + + this.createNewCurToken(type, text, channel); // Set curToken + } + + private handleCOLONEQUALtokenInIString(): void { // ISTRING = interpolated string (FSTRING or TSTRING) + if (this.lexerModeStack.length > 0 && + this.paren_or_bracket_openedStack.at(-1) === 0) { // stack peek === 0 + + // In an f-string, the walrus operator (:=) is only allowed inside parentheses. + // If used outside, split the COLONEQUAL token into a COLON + // (used as a format specifier instead of a walrus operator), + // and move the equal sign to the beginning of the next token (FSTRING_MIDDLE or TSTRING_MIDDLE). 
+ this.curToken!.type = PythonLexer.COLON; + this.curToken!.text = ":"; + this.curToken!.stop = this.curToken!.start; + + switch (this.ffgToken!.type) { + case PythonLexer.FSTRING_MIDDLE: + case PythonLexer.TSTRING_MIDDLE: { + const token: Token = this.ffgToken!.clone(); + token.text = "=" + token.text; + token.start -= 1; + token.column -= 1; + this.ffgToken = token; + break; + } + default: { + this.addPendingToken(this.curToken!); + this.createNewCurToken(this.curISTRING_MIDDLEtokenType, "=", Token.DEFAULT_CHANNEL); + } + } + + } + this.addPendingToken(this.curToken!); + } + + private createNewCurToken(type: number, text: string, channel: number): void { + const token: CommonToken = this.curToken!.clone(); + token.type = type; + token.text = text; + token.channel = channel; + token.column += 1; + token.start += 1; + token.stop = token.start; + this.curToken = token; + } + + private pushLexerMode(mode: number): void { + this.pushMode(mode); + this.lexerModeStack.push(this.curLexerMode); + this.curLexerMode = mode; + } + + private popLexerMode(): void { + this.popMode(); + this.curLexerMode = this.lexerModeStack.pop()!; + } + + private handleFORMAT_SPECIFICATION_MODE(): void { + if (this.lexerModeStack.length == 0 || this.ffgToken!.type !== PythonLexer.RBRACE) { + return; + } + + // insert an empty FSTRING_MIDDLE or TSTRING_MIDDLE token instead of the missing format specification + switch (this.curToken!.type) { + case PythonLexer.COLON: + this.createAndAddPendingToken(this.curISTRING_MIDDLEtokenType, "", this.ffgToken!); + break; + case PythonLexer.RBRACE: + // only when the previous brace expression is not a dictionary comprehension or set comprehension + if (!this.isValid_DictionaryOrSet_ComprehensionExpression(this.prevBraceExpression)) { + this.createAndAddPendingToken(this.curISTRING_MIDDLEtokenType, "", this.ffgToken!); + } + break; + } + } + + private isValid_DictionaryOrSet_ComprehensionExpression(code: string): boolean { + const inputStream: CharStream = 
CharStreams.fromString(code); + const lexer: PythonLexer = new PythonLexer(inputStream); + const tokenStream: CommonTokenStream = new CommonTokenStream(lexer); + let parser = new PythonParser(tokenStream); + + // Disable error listeners to suppress console output + lexer.removeErrorListeners(); + parser.removeErrorListeners(); + + parser.dictcomp(); // Try parsing as dictionary comprehension + if (parser.syntaxErrorsCount === 0) { + return true; + } + + parser = new PythonParser(tokenStream); + (tokenStream as any).seek(0); // seek method is not declared in CommonTokenStream.d.ts + parser.removeErrorListeners(); + parser.setcomp(); // Try parsing as set comprehension + return parser.syntaxErrorsCount === 0; + } + + private insertTrailingTokens(): void { + switch (this.lastPendingTokenTypeFromDefaultChannel) { + case PythonLexer.NEWLINE: + case PythonLexer.DEDENT: + break; // no trailing NEWLINE token is needed + default: + // insert an extra trailing NEWLINE token that serves as the end of the last statement + this.createAndAddPendingToken(PythonLexer.NEWLINE, null, this.ffgToken!); // ffgToken is EOF + } + this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed + } + + private handleEOFtoken(): void { + if (this.lastPendingTokenTypeFromDefaultChannel > 0) { + // there was a statement in the input (leading NEWLINE tokens are hidden) + this.insertTrailingTokens(); + } + this.addPendingToken(this.curToken!); + } + + private hideAndAddPendingToken(originalToken: Token): void { + originalToken.channel = Token.HIDDEN_CHANNEL; + this.addPendingToken(originalToken); + } + + private createAndAddPendingToken(type: number, text: string | null, originalToken: Token): void { + const token: Token = originalToken.clone(); + token.type = type; + token.channel = Token.DEFAULT_CHANNEL; + token.stop = originalToken.start - 1; + token.text = text == null ? + `<${PythonLexer.symbolicNames[type] ?? 
""}>` : + text; + + this.addPendingToken(token); + } + + private addPendingToken(token: Token): void { + // save the last pending token type because the pendingTokens list can be empty by the nextToken() + this.previousPendingTokenType = token.type; + if (token.channel === Token.DEFAULT_CHANNEL) { + this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType; + } + this.pendingTokens.push(token) /* .addLast(token) */; + } + + private getIndentationLength(indentText: string): number { // the indentText may contain spaces, tabs or form feeds + let length: number = 0; + for (let ch of indentText) { + switch (ch) { + case " ": + this.wasSpaceIndentation = true; + length += 1; + break; + case "\t": + this.wasTabIndentation = true; + length += PythonLexerBase.TAB_LENGTH - (length % PythonLexerBase.TAB_LENGTH); + break; + case "\f": // form feed + length = 0; + break; + } + } + + if (this.wasTabIndentation && this.wasSpaceIndentation) { + if (!this.wasIndentationMixedWithSpacesAndTabs) { + this.wasIndentationMixedWithSpacesAndTabs = true; + length = PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent + } + } + return length; + } + + private reportLexerError(errMsg: string): void { + this.getErrorListener().syntaxError(this, this.curToken!.type, this.curToken!.line, this.curToken!.column, " LEXER" + PythonLexerBase.ERR_TXT + errMsg, undefined); + } + + private reportError(errMsg: string): void { + this.reportLexerError(errMsg); + + this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken!); + // the ERRORTOKEN also triggers a parser error + } +} \ No newline at end of file diff --git a/python/python3_13/changes.md b/python/python3_14/changes.md similarity index 82% rename from python/python3_13/changes.md rename to python/python3_14/changes.md index 7934d3757b..b9ceeee61c 100644 --- a/python/python3_13/changes.md +++ b/python/python3_14/changes.md @@ -1,3 +1,9 @@ +# Dec. 
24, 2025 +- parser grammar update for Python 3.14.2 +- tokenizing t-string literals +- tokenizing BOM Unicode character at the start of the file so it is skipped in the token stream +- moved encoding detection from PythonLexerBase to a separate component + # Jan. 07, 2025 - parser grammar update for Python 3.13.1

- added ENCODING token

diff --git a/python/python3_13/desc.xml b/python/python3_14/desc.xml similarity index 100% rename from python/python3_13/desc.xml rename to python/python3_14/desc.xml diff --git a/python/python3_13/examples/__future__.py b/python/python3_14/examples/__future__.py similarity index 100% rename from python/python3_13/examples/__future__.py rename to python/python3_14/examples/__future__.py diff --git a/python/python3_13/examples/__hello__.py b/python/python3_14/examples/__hello__.py similarity index 100% rename from python/python3_13/examples/__hello__.py rename to python/python3_14/examples/__hello__.py diff --git a/python/python3_13/examples/_aix_support.py b/python/python3_14/examples/_aix_support.py similarity index 100% rename from python/python3_13/examples/_aix_support.py rename to python/python3_14/examples/_aix_support.py diff --git a/python/python3_13/examples/_android_support.py b/python/python3_14/examples/_android_support.py similarity index 93% rename from python/python3_13/examples/_android_support.py rename to python/python3_14/examples/_android_support.py index 7572745c85..a439d03a14 100644 --- a/python/python3_13/examples/_android_support.py +++ b/python/python3_14/examples/_android_support.py @@ -6,7 +6,7 @@ # The maximum length of a log message in bytes, including the level marker and # tag, is defined as LOGGER_ENTRY_MAX_PAYLOAD at # https://cs.android.com/android/platform/superproject/+/android-14.0.0_r1:system/logging/liblog/include/log/log.h;l=71. -# Messages longer than this will be be truncated by logcat. This limit has already +# Messages longer than this will be truncated by logcat. This limit has already # been reduced at least once in the history of Android (from 4076 to 4068 between # API level 23 and 26), so leave some headroom. 
MAX_BYTES_PER_WRITE = 4000 @@ -29,15 +29,19 @@ def init_streams(android_log_write, stdout_prio, stderr_prio): global logcat logcat = Logcat(android_log_write) - - sys.stdout = TextLogStream( - stdout_prio, "python.stdout", sys.stdout.fileno()) - sys.stderr = TextLogStream( - stderr_prio, "python.stderr", sys.stderr.fileno()) + sys.stdout = TextLogStream(stdout_prio, "python.stdout", sys.stdout) + sys.stderr = TextLogStream(stderr_prio, "python.stderr", sys.stderr) class TextLogStream(io.TextIOWrapper): - def __init__(self, prio, tag, fileno=None, **kwargs): + def __init__(self, prio, tag, original=None, **kwargs): + # Respect the -u option. + if original: + kwargs.setdefault("write_through", original.write_through) + fileno = original.fileno() + else: + fileno = None + # The default is surrogateescape for stdout and backslashreplace for # stderr, but in the context of an Android log, readability is more # important than reversibility. diff --git a/python/python3_14/examples/_apple_support.py b/python/python3_14/examples/_apple_support.py new file mode 100644 index 0000000000..92febdcf58 --- /dev/null +++ b/python/python3_14/examples/_apple_support.py @@ -0,0 +1,66 @@ +import io +import sys + + +def init_streams(log_write, stdout_level, stderr_level): + # Redirect stdout and stderr to the Apple system log. This method is + # invoked by init_apple_streams() (initconfig.c) if config->use_system_logger + # is enabled. 
+    sys.stdout = SystemLog(log_write, stdout_level, errors=sys.stderr.errors)
+    sys.stderr = SystemLog(log_write, stderr_level, errors=sys.stderr.errors)
+
+
+class SystemLog(io.TextIOWrapper):
+    def __init__(self, log_write, level, **kwargs):
+        kwargs.setdefault("encoding", "UTF-8")
+        kwargs.setdefault("line_buffering", True)
+        super().__init__(LogStream(log_write, level), **kwargs)
+
+    def __repr__(self):
+        return f"<SystemLog (level {self.buffer.level})>"
+
+    def write(self, s):
+        if not isinstance(s, str):
+            raise TypeError(
+                f"write() argument must be str, not {type(s).__name__}")
+
+        # In case `s` is a str subclass that writes itself to stdout or stderr
+        # when we call its methods, convert it to an actual str.
+        s = str.__str__(s)
+
+        # We want to emit one log message per line, so split
+        # the string before sending it to the superclass.
+        for line in s.splitlines(keepends=True):
+            super().write(line)
+
+        return len(s)
+
+
+class LogStream(io.RawIOBase):
+    def __init__(self, log_write, level):
+        self.log_write = log_write
+        self.level = level
+
+    def __repr__(self):
+        return f"<LogStream (level {self.level})>"
+
+    def writable(self):
+        return True
+
+    def write(self, b):
+        if type(b) is not bytes:
+            try:
+                b = bytes(memoryview(b))
+            except TypeError:
+                raise TypeError(
+                    f"write() argument must be bytes-like, not {type(b).__name__}"
+                ) from None
+
+        # Writing an empty string to the stream should have no effect.
+        if b:
+            # Encode null bytes using "modified UTF-8" to avoid truncating the
+            # message. This should not affect the return value, as the caller
+            # may be expecting it to match the length of the input.
+            self.log_write(self.level, b.replace(b"\x00", b"\xc0\x80"))
+
+        return len(b)
diff --git a/python/python3_14/examples/_ast_unparse.py b/python/python3_14/examples/_ast_unparse.py
new file mode 100644
index 0000000000..1c8741b5a5
--- /dev/null
+++ b/python/python3_14/examples/_ast_unparse.py
@@ -0,0 +1,1161 @@
+# This module contains ``ast.unparse()``, defined here
+# to improve the import time for the ``ast`` module.
+import sys +from _ast import * +from ast import NodeVisitor +from contextlib import contextmanager, nullcontext +from enum import IntEnum, auto, _simple_enum + +# Large float and imaginary literals get turned into infinities in the AST. +# We unparse those infinities to INFSTR. +_INFSTR = "1e" + repr(sys.float_info.max_10_exp + 1) + +@_simple_enum(IntEnum) +class _Precedence: + """Precedence table that originated from python grammar.""" + + NAMED_EXPR = auto() # := + TUPLE = auto() # , + YIELD = auto() # 'yield', 'yield from' + TEST = auto() # 'if'-'else', 'lambda' + OR = auto() # 'or' + AND = auto() # 'and' + NOT = auto() # 'not' + CMP = auto() # '<', '>', '==', '>=', '<=', '!=', + # 'in', 'not in', 'is', 'is not' + EXPR = auto() + BOR = EXPR # '|' + BXOR = auto() # '^' + BAND = auto() # '&' + SHIFT = auto() # '<<', '>>' + ARITH = auto() # '+', '-' + TERM = auto() # '*', '@', '/', '%', '//' + FACTOR = auto() # unary '+', '-', '~' + POWER = auto() # '**' + AWAIT = auto() # 'await' + ATOM = auto() + + def next(self): + try: + return self.__class__(self + 1) + except ValueError: + return self + + +_SINGLE_QUOTES = ("'", '"') +_MULTI_QUOTES = ('"""', "'''") +_ALL_QUOTES = (*_SINGLE_QUOTES, *_MULTI_QUOTES) + +class Unparser(NodeVisitor): + """Methods in this class recursively traverse an AST and + output source code for the abstract syntax; original formatting + is disregarded.""" + + def __init__(self): + self._source = [] + self._precedences = {} + self._type_ignores = {} + self._indent = 0 + self._in_try_star = False + self._in_interactive = False + + def interleave(self, inter, f, seq): + """Call f on each item in seq, calling inter() in between.""" + seq = iter(seq) + try: + f(next(seq)) + except StopIteration: + pass + else: + for x in seq: + inter() + f(x) + + def items_view(self, traverser, items): + """Traverse and separate the given *items* with a comma and append it to + the buffer. 
If *items* is a single item sequence, a trailing comma + will be added.""" + if len(items) == 1: + traverser(items[0]) + self.write(",") + else: + self.interleave(lambda: self.write(", "), traverser, items) + + def maybe_newline(self): + """Adds a newline if it isn't the start of generated source""" + if self._source: + self.write("\n") + + def maybe_semicolon(self): + """Adds a "; " delimiter if it isn't the start of generated source""" + if self._source: + self.write("; ") + + def fill(self, text="", *, allow_semicolon=True): + """Indent a piece of text and append it, according to the current + indentation level, or only delineate with semicolon if applicable""" + if self._in_interactive and not self._indent and allow_semicolon: + self.maybe_semicolon() + self.write(text) + else: + self.maybe_newline() + self.write(" " * self._indent + text) + + def write(self, *text): + """Add new source parts""" + self._source.extend(text) + + @contextmanager + def buffered(self, buffer = None): + if buffer is None: + buffer = [] + + original_source = self._source + self._source = buffer + yield buffer + self._source = original_source + + @contextmanager + def block(self, *, extra = None): + """A context manager for preparing the source for blocks. It adds + the character':', increases the indentation on enter and decreases + the indentation on exit. If *extra* is given, it will be directly + appended after the colon character. + """ + self.write(":") + if extra: + self.write(extra) + self._indent += 1 + yield + self._indent -= 1 + + @contextmanager + def delimit(self, start, end): + """A context manager for preparing the source for expressions. 
It adds + *start* to the buffer and enters, after exit it adds *end*.""" + + self.write(start) + yield + self.write(end) + + def delimit_if(self, start, end, condition): + if condition: + return self.delimit(start, end) + else: + return nullcontext() + + def require_parens(self, precedence, node): + """Shortcut to adding precedence related parens""" + return self.delimit_if("(", ")", self.get_precedence(node) > precedence) + + def get_precedence(self, node): + return self._precedences.get(node, _Precedence.TEST) + + def set_precedence(self, precedence, *nodes): + for node in nodes: + self._precedences[node] = precedence + + def get_raw_docstring(self, node): + """If a docstring node is found in the body of the *node* parameter, + return that docstring node, None otherwise. + + Logic mirrored from ``_PyAST_GetDocString``.""" + if not isinstance( + node, (AsyncFunctionDef, FunctionDef, ClassDef, Module) + ) or len(node.body) < 1: + return None + node = node.body[0] + if not isinstance(node, Expr): + return None + node = node.value + if isinstance(node, Constant) and isinstance(node.value, str): + return node + + def get_type_comment(self, node): + comment = self._type_ignores.get(node.lineno) or node.type_comment + if comment is not None: + return f" # type: {comment}" + + def traverse(self, node): + if isinstance(node, list): + for item in node: + self.traverse(item) + else: + super().visit(node) + + # Note: as visit() resets the output text, do NOT rely on + # NodeVisitor.generic_visit to handle any nodes (as it calls back in to + # the subclass visit() method, which resets self._source to an empty list) + def visit(self, node): + """Outputs a source code string that, if converted back to an ast + (using ast.parse) will generate an AST equivalent to *node*""" + self._source = [] + self.traverse(node) + return "".join(self._source) + + def _write_docstring_and_traverse_body(self, node): + if (docstring := self.get_raw_docstring(node)): + 
self._write_docstring(docstring) + self.traverse(node.body[1:]) + else: + self.traverse(node.body) + + def visit_Module(self, node): + self._type_ignores = { + ignore.lineno: f"ignore{ignore.tag}" + for ignore in node.type_ignores + } + try: + self._write_docstring_and_traverse_body(node) + finally: + self._type_ignores.clear() + + def visit_Interactive(self, node): + self._in_interactive = True + try: + self._write_docstring_and_traverse_body(node) + finally: + self._in_interactive = False + + def visit_FunctionType(self, node): + with self.delimit("(", ")"): + self.interleave( + lambda: self.write(", "), self.traverse, node.argtypes + ) + + self.write(" -> ") + self.traverse(node.returns) + + def visit_Expr(self, node): + self.fill() + self.set_precedence(_Precedence.YIELD, node.value) + self.traverse(node.value) + + def visit_NamedExpr(self, node): + with self.require_parens(_Precedence.NAMED_EXPR, node): + self.set_precedence(_Precedence.ATOM, node.target, node.value) + self.traverse(node.target) + self.write(" := ") + self.traverse(node.value) + + def visit_Import(self, node): + self.fill("import ") + self.interleave(lambda: self.write(", "), self.traverse, node.names) + + def visit_ImportFrom(self, node): + self.fill("from ") + self.write("." 
* (node.level or 0)) + if node.module: + self.write(node.module) + self.write(" import ") + self.interleave(lambda: self.write(", "), self.traverse, node.names) + + def visit_Assign(self, node): + self.fill() + for target in node.targets: + self.set_precedence(_Precedence.TUPLE, target) + self.traverse(target) + self.write(" = ") + self.traverse(node.value) + if type_comment := self.get_type_comment(node): + self.write(type_comment) + + def visit_AugAssign(self, node): + self.fill() + self.traverse(node.target) + self.write(" " + self.binop[node.op.__class__.__name__] + "= ") + self.traverse(node.value) + + def visit_AnnAssign(self, node): + self.fill() + with self.delimit_if("(", ")", not node.simple and isinstance(node.target, Name)): + self.traverse(node.target) + self.write(": ") + self.traverse(node.annotation) + if node.value: + self.write(" = ") + self.traverse(node.value) + + def visit_Return(self, node): + self.fill("return") + if node.value: + self.write(" ") + self.traverse(node.value) + + def visit_Pass(self, node): + self.fill("pass") + + def visit_Break(self, node): + self.fill("break") + + def visit_Continue(self, node): + self.fill("continue") + + def visit_Delete(self, node): + self.fill("del ") + self.interleave(lambda: self.write(", "), self.traverse, node.targets) + + def visit_Assert(self, node): + self.fill("assert ") + self.traverse(node.test) + if node.msg: + self.write(", ") + self.traverse(node.msg) + + def visit_Global(self, node): + self.fill("global ") + self.interleave(lambda: self.write(", "), self.write, node.names) + + def visit_Nonlocal(self, node): + self.fill("nonlocal ") + self.interleave(lambda: self.write(", "), self.write, node.names) + + def visit_Await(self, node): + with self.require_parens(_Precedence.AWAIT, node): + self.write("await") + if node.value: + self.write(" ") + self.set_precedence(_Precedence.ATOM, node.value) + self.traverse(node.value) + + def visit_Yield(self, node): + with 
self.require_parens(_Precedence.YIELD, node): + self.write("yield") + if node.value: + self.write(" ") + self.set_precedence(_Precedence.ATOM, node.value) + self.traverse(node.value) + + def visit_YieldFrom(self, node): + with self.require_parens(_Precedence.YIELD, node): + self.write("yield from ") + if not node.value: + raise ValueError("Node can't be used without a value attribute.") + self.set_precedence(_Precedence.ATOM, node.value) + self.traverse(node.value) + + def visit_Raise(self, node): + self.fill("raise") + if not node.exc: + if node.cause: + raise ValueError(f"Node can't use cause without an exception.") + return + self.write(" ") + self.traverse(node.exc) + if node.cause: + self.write(" from ") + self.traverse(node.cause) + + def do_visit_try(self, node): + self.fill("try", allow_semicolon=False) + with self.block(): + self.traverse(node.body) + for ex in node.handlers: + self.traverse(ex) + if node.orelse: + self.fill("else", allow_semicolon=False) + with self.block(): + self.traverse(node.orelse) + if node.finalbody: + self.fill("finally", allow_semicolon=False) + with self.block(): + self.traverse(node.finalbody) + + def visit_Try(self, node): + prev_in_try_star = self._in_try_star + try: + self._in_try_star = False + self.do_visit_try(node) + finally: + self._in_try_star = prev_in_try_star + + def visit_TryStar(self, node): + prev_in_try_star = self._in_try_star + try: + self._in_try_star = True + self.do_visit_try(node) + finally: + self._in_try_star = prev_in_try_star + + def visit_ExceptHandler(self, node): + self.fill("except*" if self._in_try_star else "except", allow_semicolon=False) + if node.type: + self.write(" ") + self.traverse(node.type) + if node.name: + self.write(" as ") + self.write(node.name) + with self.block(): + self.traverse(node.body) + + def visit_ClassDef(self, node): + self.maybe_newline() + for deco in node.decorator_list: + self.fill("@", allow_semicolon=False) + self.traverse(deco) + self.fill("class " + node.name, 
allow_semicolon=False) + if hasattr(node, "type_params"): + self._type_params_helper(node.type_params) + with self.delimit_if("(", ")", condition = node.bases or node.keywords): + comma = False + for e in node.bases: + if comma: + self.write(", ") + else: + comma = True + self.traverse(e) + for e in node.keywords: + if comma: + self.write(", ") + else: + comma = True + self.traverse(e) + + with self.block(): + self._write_docstring_and_traverse_body(node) + + def visit_FunctionDef(self, node): + self._function_helper(node, "def") + + def visit_AsyncFunctionDef(self, node): + self._function_helper(node, "async def") + + def _function_helper(self, node, fill_suffix): + self.maybe_newline() + for deco in node.decorator_list: + self.fill("@", allow_semicolon=False) + self.traverse(deco) + def_str = fill_suffix + " " + node.name + self.fill(def_str, allow_semicolon=False) + if hasattr(node, "type_params"): + self._type_params_helper(node.type_params) + with self.delimit("(", ")"): + self.traverse(node.args) + if node.returns: + self.write(" -> ") + self.traverse(node.returns) + with self.block(extra=self.get_type_comment(node)): + self._write_docstring_and_traverse_body(node) + + def _type_params_helper(self, type_params): + if type_params is not None and len(type_params) > 0: + with self.delimit("[", "]"): + self.interleave(lambda: self.write(", "), self.traverse, type_params) + + def visit_TypeVar(self, node): + self.write(node.name) + if node.bound: + self.write(": ") + self.traverse(node.bound) + if node.default_value: + self.write(" = ") + self.traverse(node.default_value) + + def visit_TypeVarTuple(self, node): + self.write("*" + node.name) + if node.default_value: + self.write(" = ") + self.traverse(node.default_value) + + def visit_ParamSpec(self, node): + self.write("**" + node.name) + if node.default_value: + self.write(" = ") + self.traverse(node.default_value) + + def visit_TypeAlias(self, node): + self.fill("type ") + self.traverse(node.name) + 
self._type_params_helper(node.type_params) + self.write(" = ") + self.traverse(node.value) + + def visit_For(self, node): + self._for_helper("for ", node) + + def visit_AsyncFor(self, node): + self._for_helper("async for ", node) + + def _for_helper(self, fill, node): + self.fill(fill, allow_semicolon=False) + self.set_precedence(_Precedence.TUPLE, node.target) + self.traverse(node.target) + self.write(" in ") + self.traverse(node.iter) + with self.block(extra=self.get_type_comment(node)): + self.traverse(node.body) + if node.orelse: + self.fill("else", allow_semicolon=False) + with self.block(): + self.traverse(node.orelse) + + def visit_If(self, node): + self.fill("if ", allow_semicolon=False) + self.traverse(node.test) + with self.block(): + self.traverse(node.body) + # collapse nested ifs into equivalent elifs. + while node.orelse and len(node.orelse) == 1 and isinstance(node.orelse[0], If): + node = node.orelse[0] + self.fill("elif ", allow_semicolon=False) + self.traverse(node.test) + with self.block(): + self.traverse(node.body) + # final else + if node.orelse: + self.fill("else", allow_semicolon=False) + with self.block(): + self.traverse(node.orelse) + + def visit_While(self, node): + self.fill("while ", allow_semicolon=False) + self.traverse(node.test) + with self.block(): + self.traverse(node.body) + if node.orelse: + self.fill("else", allow_semicolon=False) + with self.block(): + self.traverse(node.orelse) + + def visit_With(self, node): + self.fill("with ", allow_semicolon=False) + self.interleave(lambda: self.write(", "), self.traverse, node.items) + with self.block(extra=self.get_type_comment(node)): + self.traverse(node.body) + + def visit_AsyncWith(self, node): + self.fill("async with ", allow_semicolon=False) + self.interleave(lambda: self.write(", "), self.traverse, node.items) + with self.block(extra=self.get_type_comment(node)): + self.traverse(node.body) + + def _str_literal_helper( + self, string, *, quote_types=_ALL_QUOTES, 
escape_special_whitespace=False + ): + """Helper for writing string literals, minimizing escapes. + Returns the tuple (string literal to write, possible quote types). + """ + def escape_char(c): + # \n and \t are non-printable, but we only escape them if + # escape_special_whitespace is True + if not escape_special_whitespace and c in "\n\t": + return c + # Always escape backslashes and other non-printable characters + if c == "\\" or not c.isprintable(): + return c.encode("unicode_escape").decode("ascii") + return c + + escaped_string = "".join(map(escape_char, string)) + possible_quotes = quote_types + if "\n" in escaped_string: + possible_quotes = [q for q in possible_quotes if q in _MULTI_QUOTES] + possible_quotes = [q for q in possible_quotes if q not in escaped_string] + if not possible_quotes: + # If there aren't any possible_quotes, fallback to using repr + # on the original string. Try to use a quote from quote_types, + # e.g., so that we use triple quotes for docstrings. + string = repr(string) + quote = next((q for q in quote_types if string[0] in q), string[0]) + return string[1:-1], [quote] + if escaped_string: + # Sort so that we prefer '''"''' over """\"""" + possible_quotes.sort(key=lambda q: q[0] == escaped_string[-1]) + # If we're using triple quotes and we'd need to escape a final + # quote, escape it + if possible_quotes[0][0] == escaped_string[-1]: + assert len(possible_quotes[0]) == 3 + escaped_string = escaped_string[:-1] + "\\" + escaped_string[-1] + return escaped_string, possible_quotes + + def _write_str_avoiding_backslashes(self, string, *, quote_types=_ALL_QUOTES): + """Write string literal value with a best effort attempt to avoid backslashes.""" + string, quote_types = self._str_literal_helper(string, quote_types=quote_types) + quote_type = quote_types[0] + self.write(f"{quote_type}{string}{quote_type}") + + def _ftstring_helper(self, parts): + new_parts = [] + quote_types = list(_ALL_QUOTES) + fallback_to_repr = False + for value, 
is_constant in parts: + if is_constant: + value, new_quote_types = self._str_literal_helper( + value, + quote_types=quote_types, + escape_special_whitespace=True, + ) + if set(new_quote_types).isdisjoint(quote_types): + fallback_to_repr = True + break + quote_types = new_quote_types + else: + if "\n" in value: + quote_types = [q for q in quote_types if q in _MULTI_QUOTES] + assert quote_types + + new_quote_types = [q for q in quote_types if q not in value] + if new_quote_types: + quote_types = new_quote_types + new_parts.append(value) + + if fallback_to_repr: + # If we weren't able to find a quote type that works for all parts + # of the JoinedStr, fallback to using repr and triple single quotes. + quote_types = ["'''"] + new_parts.clear() + for value, is_constant in parts: + if is_constant: + value = repr('"' + value) # force repr to use single quotes + expected_prefix = "'\"" + assert value.startswith(expected_prefix), repr(value) + value = value[len(expected_prefix):-1] + new_parts.append(value) + + value = "".join(new_parts) + quote_type = quote_types[0] + self.write(f"{quote_type}{value}{quote_type}") + + def _write_ftstring(self, values, prefix): + self.write(prefix) + fstring_parts = [] + for value in values: + with self.buffered() as buffer: + self._write_ftstring_inner(value) + fstring_parts.append( + ("".join(buffer), isinstance(value, Constant)) + ) + self._ftstring_helper(fstring_parts) + + def visit_JoinedStr(self, node): + self._write_ftstring(node.values, "f") + + def visit_TemplateStr(self, node): + self._write_ftstring(node.values, "t") + + def _write_ftstring_inner(self, node, is_format_spec=False): + if isinstance(node, JoinedStr): + # for both the f-string itself, and format_spec + for value in node.values: + self._write_ftstring_inner(value, is_format_spec=is_format_spec) + elif isinstance(node, Constant) and isinstance(node.value, str): + value = node.value.replace("{", "{{").replace("}", "}}") + + if is_format_spec: + value = 
value.replace("\\", "\\\\") + value = value.replace("'", "\\'") + value = value.replace('"', '\\"') + value = value.replace("\n", "\\n") + self.write(value) + elif isinstance(node, FormattedValue): + self.visit_FormattedValue(node) + elif isinstance(node, Interpolation): + self.visit_Interpolation(node) + else: + raise ValueError(f"Unexpected node inside JoinedStr, {node!r}") + + def _unparse_interpolation_value(self, inner): + unparser = type(self)() + unparser.set_precedence(_Precedence.TEST.next(), inner) + return unparser.visit(inner) + + def _write_interpolation(self, node, use_str_attr=False): + with self.delimit("{", "}"): + if use_str_attr: + expr = node.str + else: + expr = self._unparse_interpolation_value(node.value) + if expr.startswith("{"): + # Separate pair of opening brackets as "{ {" + self.write(" ") + self.write(expr) + if node.conversion != -1: + self.write(f"!{chr(node.conversion)}") + if node.format_spec: + self.write(":") + self._write_ftstring_inner(node.format_spec, is_format_spec=True) + + def visit_FormattedValue(self, node): + self._write_interpolation(node) + + def visit_Interpolation(self, node): + # If `str` is set to `None`, use the `value` to generate the source code. + self._write_interpolation(node, use_str_attr=node.str is not None) + + def visit_Name(self, node): + self.write(node.id) + + def _write_docstring(self, node): + self.fill(allow_semicolon=False) + if node.kind == "u": + self.write("u") + self._write_str_avoiding_backslashes(node.value, quote_types=_MULTI_QUOTES) + + def _write_constant(self, value): + if isinstance(value, (float, complex)): + # Substitute overflowing decimal literal for AST infinities, + # and inf - inf for NaNs. 
+ self.write( + repr(value) + .replace("inf", _INFSTR) + .replace("nan", f"({_INFSTR}-{_INFSTR})") + ) + else: + self.write(repr(value)) + + def visit_Constant(self, node): + value = node.value + if isinstance(value, tuple): + with self.delimit("(", ")"): + self.items_view(self._write_constant, value) + elif value is ...: + self.write("...") + else: + if node.kind == "u": + self.write("u") + self._write_constant(node.value) + + def visit_List(self, node): + with self.delimit("[", "]"): + self.interleave(lambda: self.write(", "), self.traverse, node.elts) + + def visit_ListComp(self, node): + with self.delimit("[", "]"): + self.traverse(node.elt) + for gen in node.generators: + self.traverse(gen) + + def visit_GeneratorExp(self, node): + with self.delimit("(", ")"): + self.traverse(node.elt) + for gen in node.generators: + self.traverse(gen) + + def visit_SetComp(self, node): + with self.delimit("{", "}"): + self.traverse(node.elt) + for gen in node.generators: + self.traverse(gen) + + def visit_DictComp(self, node): + with self.delimit("{", "}"): + self.traverse(node.key) + self.write(": ") + self.traverse(node.value) + for gen in node.generators: + self.traverse(gen) + + def visit_comprehension(self, node): + if node.is_async: + self.write(" async for ") + else: + self.write(" for ") + self.set_precedence(_Precedence.TUPLE, node.target) + self.traverse(node.target) + self.write(" in ") + self.set_precedence(_Precedence.TEST.next(), node.iter, *node.ifs) + self.traverse(node.iter) + for if_clause in node.ifs: + self.write(" if ") + self.traverse(if_clause) + + def visit_IfExp(self, node): + with self.require_parens(_Precedence.TEST, node): + self.set_precedence(_Precedence.TEST.next(), node.body, node.test) + self.traverse(node.body) + self.write(" if ") + self.traverse(node.test) + self.write(" else ") + self.set_precedence(_Precedence.TEST, node.orelse) + self.traverse(node.orelse) + + def visit_Set(self, node): + if node.elts: + with self.delimit("{", "}"): + 
self.interleave(lambda: self.write(", "), self.traverse, node.elts) + else: + # `{}` would be interpreted as a dictionary literal, and + # `set` might be shadowed. Thus: + self.write('{*()}') + + def visit_Dict(self, node): + def write_key_value_pair(k, v): + self.traverse(k) + self.write(": ") + self.traverse(v) + + def write_item(item): + k, v = item + if k is None: + # for dictionary unpacking operator in dicts {**{'y': 2}} + # see PEP 448 for details + self.write("**") + self.set_precedence(_Precedence.EXPR, v) + self.traverse(v) + else: + write_key_value_pair(k, v) + + with self.delimit("{", "}"): + self.interleave( + lambda: self.write(", "), write_item, zip(node.keys, node.values) + ) + + def visit_Tuple(self, node): + with self.delimit_if( + "(", + ")", + len(node.elts) == 0 or self.get_precedence(node) > _Precedence.TUPLE + ): + self.items_view(self.traverse, node.elts) + + unop = {"Invert": "~", "Not": "not", "UAdd": "+", "USub": "-"} + unop_precedence = { + "not": _Precedence.NOT, + "~": _Precedence.FACTOR, + "+": _Precedence.FACTOR, + "-": _Precedence.FACTOR, + } + + def visit_UnaryOp(self, node): + operator = self.unop[node.op.__class__.__name__] + operator_precedence = self.unop_precedence[operator] + with self.require_parens(operator_precedence, node): + self.write(operator) + # factor prefixes (+, -, ~) shouldn't be separated + # from the value they belong, (e.g: +1 instead of + 1) + if operator_precedence is not _Precedence.FACTOR: + self.write(" ") + self.set_precedence(operator_precedence, node.operand) + self.traverse(node.operand) + + binop = { + "Add": "+", + "Sub": "-", + "Mult": "*", + "MatMult": "@", + "Div": "/", + "Mod": "%", + "LShift": "<<", + "RShift": ">>", + "BitOr": "|", + "BitXor": "^", + "BitAnd": "&", + "FloorDiv": "//", + "Pow": "**", + } + + binop_precedence = { + "+": _Precedence.ARITH, + "-": _Precedence.ARITH, + "*": _Precedence.TERM, + "@": _Precedence.TERM, + "/": _Precedence.TERM, + "%": _Precedence.TERM, + "<<": 
_Precedence.SHIFT, + ">>": _Precedence.SHIFT, + "|": _Precedence.BOR, + "^": _Precedence.BXOR, + "&": _Precedence.BAND, + "//": _Precedence.TERM, + "**": _Precedence.POWER, + } + + binop_rassoc = frozenset(("**",)) + def visit_BinOp(self, node): + operator = self.binop[node.op.__class__.__name__] + operator_precedence = self.binop_precedence[operator] + with self.require_parens(operator_precedence, node): + if operator in self.binop_rassoc: + left_precedence = operator_precedence.next() + right_precedence = operator_precedence + else: + left_precedence = operator_precedence + right_precedence = operator_precedence.next() + + self.set_precedence(left_precedence, node.left) + self.traverse(node.left) + self.write(f" {operator} ") + self.set_precedence(right_precedence, node.right) + self.traverse(node.right) + + cmpops = { + "Eq": "==", + "NotEq": "!=", + "Lt": "<", + "LtE": "<=", + "Gt": ">", + "GtE": ">=", + "Is": "is", + "IsNot": "is not", + "In": "in", + "NotIn": "not in", + } + + def visit_Compare(self, node): + with self.require_parens(_Precedence.CMP, node): + self.set_precedence(_Precedence.CMP.next(), node.left, *node.comparators) + self.traverse(node.left) + for o, e in zip(node.ops, node.comparators): + self.write(" " + self.cmpops[o.__class__.__name__] + " ") + self.traverse(e) + + boolops = {"And": "and", "Or": "or"} + boolop_precedence = {"and": _Precedence.AND, "or": _Precedence.OR} + + def visit_BoolOp(self, node): + operator = self.boolops[node.op.__class__.__name__] + operator_precedence = self.boolop_precedence[operator] + + def increasing_level_traverse(node): + nonlocal operator_precedence + operator_precedence = operator_precedence.next() + self.set_precedence(operator_precedence, node) + self.traverse(node) + + with self.require_parens(operator_precedence, node): + s = f" {operator} " + self.interleave(lambda: self.write(s), increasing_level_traverse, node.values) + + def visit_Attribute(self, node): + self.set_precedence(_Precedence.ATOM, 
node.value) + self.traverse(node.value) + # Special case: 3.__abs__() is a syntax error, so if node.value + # is an integer literal then we need to either parenthesize + # it or add an extra space to get 3 .__abs__(). + if isinstance(node.value, Constant) and isinstance(node.value.value, int): + self.write(" ") + self.write(".") + self.write(node.attr) + + def visit_Call(self, node): + self.set_precedence(_Precedence.ATOM, node.func) + self.traverse(node.func) + with self.delimit("(", ")"): + comma = False + for e in node.args: + if comma: + self.write(", ") + else: + comma = True + self.traverse(e) + for e in node.keywords: + if comma: + self.write(", ") + else: + comma = True + self.traverse(e) + + def visit_Subscript(self, node): + def is_non_empty_tuple(slice_value): + return ( + isinstance(slice_value, Tuple) + and slice_value.elts + ) + + self.set_precedence(_Precedence.ATOM, node.value) + self.traverse(node.value) + with self.delimit("[", "]"): + if is_non_empty_tuple(node.slice): + # parentheses can be omitted if the tuple isn't empty + self.items_view(self.traverse, node.slice.elts) + else: + self.traverse(node.slice) + + def visit_Starred(self, node): + self.write("*") + self.set_precedence(_Precedence.EXPR, node.value) + self.traverse(node.value) + + def visit_Ellipsis(self, node): + self.write("...") + + def visit_Slice(self, node): + if node.lower: + self.traverse(node.lower) + self.write(":") + if node.upper: + self.traverse(node.upper) + if node.step: + self.write(":") + self.traverse(node.step) + + def visit_Match(self, node): + self.fill("match ", allow_semicolon=False) + self.traverse(node.subject) + with self.block(): + for case in node.cases: + self.traverse(case) + + def visit_arg(self, node): + self.write(node.arg) + if node.annotation: + self.write(": ") + self.traverse(node.annotation) + + def visit_arguments(self, node): + first = True + # normal arguments + all_args = node.posonlyargs + node.args + defaults = [None] * (len(all_args) - 
len(node.defaults)) + node.defaults + for index, elements in enumerate(zip(all_args, defaults), 1): + a, d = elements + if first: + first = False + else: + self.write(", ") + self.traverse(a) + if d: + self.write("=") + self.traverse(d) + if index == len(node.posonlyargs): + self.write(", /") + + # varargs, or bare '*' if no varargs but keyword-only arguments present + if node.vararg or node.kwonlyargs: + if first: + first = False + else: + self.write(", ") + self.write("*") + if node.vararg: + self.write(node.vararg.arg) + if node.vararg.annotation: + self.write(": ") + self.traverse(node.vararg.annotation) + + # keyword-only arguments + if node.kwonlyargs: + for a, d in zip(node.kwonlyargs, node.kw_defaults): + self.write(", ") + self.traverse(a) + if d: + self.write("=") + self.traverse(d) + + # kwargs + if node.kwarg: + if first: + first = False + else: + self.write(", ") + self.write("**" + node.kwarg.arg) + if node.kwarg.annotation: + self.write(": ") + self.traverse(node.kwarg.annotation) + + def visit_keyword(self, node): + if node.arg is None: + self.write("**") + else: + self.write(node.arg) + self.write("=") + self.traverse(node.value) + + def visit_Lambda(self, node): + with self.require_parens(_Precedence.TEST, node): + self.write("lambda") + with self.buffered() as buffer: + self.traverse(node.args) + if buffer: + self.write(" ", *buffer) + self.write(": ") + self.set_precedence(_Precedence.TEST, node.body) + self.traverse(node.body) + + def visit_alias(self, node): + self.write(node.name) + if node.asname: + self.write(" as " + node.asname) + + def visit_withitem(self, node): + self.traverse(node.context_expr) + if node.optional_vars: + self.write(" as ") + self.traverse(node.optional_vars) + + def visit_match_case(self, node): + self.fill("case ", allow_semicolon=False) + self.traverse(node.pattern) + if node.guard: + self.write(" if ") + self.traverse(node.guard) + with self.block(): + self.traverse(node.body) + + def visit_MatchValue(self, node): 
+ self.traverse(node.value) + + def visit_MatchSingleton(self, node): + self._write_constant(node.value) + + def visit_MatchSequence(self, node): + with self.delimit("[", "]"): + self.interleave( + lambda: self.write(", "), self.traverse, node.patterns + ) + + def visit_MatchStar(self, node): + name = node.name + if name is None: + name = "_" + self.write(f"*{name}") + + def visit_MatchMapping(self, node): + def write_key_pattern_pair(pair): + k, p = pair + self.traverse(k) + self.write(": ") + self.traverse(p) + + with self.delimit("{", "}"): + keys = node.keys + self.interleave( + lambda: self.write(", "), + write_key_pattern_pair, + zip(keys, node.patterns, strict=True), + ) + rest = node.rest + if rest is not None: + if keys: + self.write(", ") + self.write(f"**{rest}") + + def visit_MatchClass(self, node): + self.set_precedence(_Precedence.ATOM, node.cls) + self.traverse(node.cls) + with self.delimit("(", ")"): + patterns = node.patterns + self.interleave( + lambda: self.write(", "), self.traverse, patterns + ) + attrs = node.kwd_attrs + if attrs: + def write_attr_pattern(pair): + attr, pattern = pair + self.write(f"{attr}=") + self.traverse(pattern) + + if patterns: + self.write(", ") + self.interleave( + lambda: self.write(", "), + write_attr_pattern, + zip(attrs, node.kwd_patterns, strict=True), + ) + + def visit_MatchAs(self, node): + name = node.name + pattern = node.pattern + if name is None: + self.write("_") + elif pattern is None: + self.write(node.name) + else: + with self.require_parens(_Precedence.TEST, node): + self.set_precedence(_Precedence.BOR, node.pattern) + self.traverse(node.pattern) + self.write(f" as {node.name}") + + def visit_MatchOr(self, node): + with self.require_parens(_Precedence.BOR, node): + self.set_precedence(_Precedence.BOR.next(), *node.patterns) + self.interleave(lambda: self.write(" | "), self.traverse, node.patterns) diff --git a/python/python3_13/examples/_collections_abc.py 
b/python/python3_14/examples/_collections_abc.py similarity index 97% rename from python/python3_13/examples/_collections_abc.py rename to python/python3_14/examples/_collections_abc.py index aebe9c8b64..241d40d574 100644 --- a/python/python3_13/examples/_collections_abc.py +++ b/python/python3_14/examples/_collections_abc.py @@ -485,9 +485,10 @@ def __new__(cls, origin, args): def __repr__(self): if len(self.__args__) == 2 and _is_param_expr(self.__args__[0]): return super().__repr__() + from annotationlib import type_repr return (f'collections.abc.Callable' - f'[[{", ".join([_type_repr(a) for a in self.__args__[:-1]])}], ' - f'{_type_repr(self.__args__[-1])}]') + f'[[{", ".join([type_repr(a) for a in self.__args__[:-1]])}], ' + f'{type_repr(self.__args__[-1])}]') def __reduce__(self): args = self.__args__ @@ -524,23 +525,6 @@ def _is_param_expr(obj): names = ('ParamSpec', '_ConcatenateGenericAlias') return obj.__module__ == 'typing' and any(obj.__name__ == name for name in names) -def _type_repr(obj): - """Return the repr() of an object, special-casing types (internal helper). - - Copied from :mod:`typing` since collections.abc - shouldn't depend on that module. - (Keep this roughly in sync with the typing version.) - """ - if isinstance(obj, type): - if obj.__module__ == 'builtins': - return obj.__qualname__ - return f'{obj.__module__}.{obj.__qualname__}' - if obj is Ellipsis: - return '...' 
- if isinstance(obj, FunctionType): - return obj.__name__ - return repr(obj) - class Callable(metaclass=ABCMeta): @@ -1073,6 +1057,7 @@ def count(self, value): Sequence.register(tuple) Sequence.register(str) +Sequence.register(bytes) Sequence.register(range) Sequence.register(memoryview) @@ -1083,7 +1068,7 @@ def __new__(cls, name, bases, namespace, **kwargs): warnings._deprecated( "collections.abc.ByteString", - remove=(3, 14), + remove=(3, 17), ) return super().__new__(cls, name, bases, namespace, **kwargs) @@ -1092,14 +1077,18 @@ def __instancecheck__(cls, instance): warnings._deprecated( "collections.abc.ByteString", - remove=(3, 14), + remove=(3, 17), ) return super().__instancecheck__(instance) class ByteString(Sequence, metaclass=_DeprecateByteStringMeta): - """This unifies bytes and bytearray. + """Deprecated ABC serving as a common supertype of ``bytes`` and ``bytearray``. - XXX Should add all their methods. + This ABC is scheduled for removal in Python 3.17. + Use ``isinstance(obj, collections.abc.Buffer)`` to test if ``obj`` + implements the buffer protocol at runtime. For use in type annotations, + either use ``Buffer`` or a union that explicitly specifies the types your + code supports (e.g., ``bytes | bytearray | memoryview``). 
""" __slots__ = () @@ -1175,4 +1164,4 @@ def __iadd__(self, values): MutableSequence.register(list) -MutableSequence.register(bytearray) # Multiply inheriting, see ByteString +MutableSequence.register(bytearray) diff --git a/python/python3_14/examples/_colorize.py b/python/python3_14/examples/_colorize.py new file mode 100644 index 0000000000..d6673f6692 --- /dev/null +++ b/python/python3_14/examples/_colorize.py @@ -0,0 +1,355 @@ +import os +import sys + +from collections.abc import Callable, Iterator, Mapping +from dataclasses import dataclass, field, Field + +COLORIZE = True + + +# types +if False: + from typing import IO, Self, ClassVar + _theme: Theme + + +class ANSIColors: + RESET = "\x1b[0m" + + BLACK = "\x1b[30m" + BLUE = "\x1b[34m" + CYAN = "\x1b[36m" + GREEN = "\x1b[32m" + GREY = "\x1b[90m" + MAGENTA = "\x1b[35m" + RED = "\x1b[31m" + WHITE = "\x1b[37m" # more like LIGHT GRAY + YELLOW = "\x1b[33m" + + BOLD = "\x1b[1m" + BOLD_BLACK = "\x1b[1;30m" # DARK GRAY + BOLD_BLUE = "\x1b[1;34m" + BOLD_CYAN = "\x1b[1;36m" + BOLD_GREEN = "\x1b[1;32m" + BOLD_MAGENTA = "\x1b[1;35m" + BOLD_RED = "\x1b[1;31m" + BOLD_WHITE = "\x1b[1;37m" # actual WHITE + BOLD_YELLOW = "\x1b[1;33m" + + # intense = like bold but without being bold + INTENSE_BLACK = "\x1b[90m" + INTENSE_BLUE = "\x1b[94m" + INTENSE_CYAN = "\x1b[96m" + INTENSE_GREEN = "\x1b[92m" + INTENSE_MAGENTA = "\x1b[95m" + INTENSE_RED = "\x1b[91m" + INTENSE_WHITE = "\x1b[97m" + INTENSE_YELLOW = "\x1b[93m" + + BACKGROUND_BLACK = "\x1b[40m" + BACKGROUND_BLUE = "\x1b[44m" + BACKGROUND_CYAN = "\x1b[46m" + BACKGROUND_GREEN = "\x1b[42m" + BACKGROUND_MAGENTA = "\x1b[45m" + BACKGROUND_RED = "\x1b[41m" + BACKGROUND_WHITE = "\x1b[47m" + BACKGROUND_YELLOW = "\x1b[43m" + + INTENSE_BACKGROUND_BLACK = "\x1b[100m" + INTENSE_BACKGROUND_BLUE = "\x1b[104m" + INTENSE_BACKGROUND_CYAN = "\x1b[106m" + INTENSE_BACKGROUND_GREEN = "\x1b[102m" + INTENSE_BACKGROUND_MAGENTA = "\x1b[105m" + INTENSE_BACKGROUND_RED = "\x1b[101m" + 
INTENSE_BACKGROUND_WHITE = "\x1b[107m" + INTENSE_BACKGROUND_YELLOW = "\x1b[103m" + + +ColorCodes = set() +NoColors = ANSIColors() + +for attr, code in ANSIColors.__dict__.items(): + if not attr.startswith("__"): + ColorCodes.add(code) + setattr(NoColors, attr, "") + + +# +# Experimental theming support (see gh-133346) +# + +# - Create a theme by copying an existing `Theme` with one or more sections +# replaced, using `default_theme.copy_with()`; +# - create a theme section by copying an existing `ThemeSection` with one or +# more colors replaced, using for example `default_theme.syntax.copy_with()`; +# - create a theme from scratch by instantiating a `Theme` data class with +# the required sections (which are also dataclass instances). +# +# Then call `_colorize.set_theme(your_theme)` to set it. +# +# Put your theme configuration in $PYTHONSTARTUP for the interactive shell, +# or sitecustomize.py in your virtual environment or Python installation for +# other uses. Your applications can call `_colorize.set_theme()` too. +# +# Note that thanks to the dataclasses providing default values for all fields, +# creating a new theme or theme section from scratch is possible without +# specifying all keys. +# +# For example, here's a theme that makes punctuation and operators less prominent: +# +# try: +# from _colorize import set_theme, default_theme, Syntax, ANSIColors +# except ImportError: +# pass +# else: +# theme_with_dim_operators = default_theme.copy_with( +# syntax=Syntax(op=ANSIColors.INTENSE_BLACK), +# ) +# set_theme(theme_with_dim_operators) +# del set_theme, default_theme, Syntax, ANSIColors, theme_with_dim_operators +# +# Guarding the import ensures that your .pythonstartup file will still work in +# Python 3.13 and older. Deleting the variables ensures they don't remain in your +# interactive shell's global scope. + +class ThemeSection(Mapping[str, str]): + """A mixin/base class for theme sections. 
+ + It enables dictionary access to a section, as well as implements convenience + methods. + """ + + # The two types below are just that: types to inform the type checker that the + # mixin will work in context of those fields existing + __dataclass_fields__: ClassVar[dict[str, Field[str]]] + _name_to_value: Callable[[str], str] + + def __post_init__(self) -> None: + name_to_value = {} + for color_name in self.__dataclass_fields__: + name_to_value[color_name] = getattr(self, color_name) + super().__setattr__('_name_to_value', name_to_value.__getitem__) + + def copy_with(self, **kwargs: str) -> Self: + color_state: dict[str, str] = {} + for color_name in self.__dataclass_fields__: + color_state[color_name] = getattr(self, color_name) + color_state.update(kwargs) + return type(self)(**color_state) + + @classmethod + def no_colors(cls) -> Self: + color_state: dict[str, str] = {} + for color_name in cls.__dataclass_fields__: + color_state[color_name] = "" + return cls(**color_state) + + def __getitem__(self, key: str) -> str: + return self._name_to_value(key) + + def __len__(self) -> int: + return len(self.__dataclass_fields__) + + def __iter__(self) -> Iterator[str]: + return iter(self.__dataclass_fields__) + + +@dataclass(frozen=True, kw_only=True) +class Argparse(ThemeSection): + usage: str = ANSIColors.BOLD_BLUE + prog: str = ANSIColors.BOLD_MAGENTA + prog_extra: str = ANSIColors.MAGENTA + heading: str = ANSIColors.BOLD_BLUE + summary_long_option: str = ANSIColors.CYAN + summary_short_option: str = ANSIColors.GREEN + summary_label: str = ANSIColors.YELLOW + summary_action: str = ANSIColors.GREEN + long_option: str = ANSIColors.BOLD_CYAN + short_option: str = ANSIColors.BOLD_GREEN + label: str = ANSIColors.BOLD_YELLOW + action: str = ANSIColors.BOLD_GREEN + reset: str = ANSIColors.RESET + + +@dataclass(frozen=True) +class Syntax(ThemeSection): + prompt: str = ANSIColors.BOLD_MAGENTA + keyword: str = ANSIColors.BOLD_BLUE + keyword_constant: str = 
ANSIColors.BOLD_BLUE + builtin: str = ANSIColors.CYAN + comment: str = ANSIColors.RED + string: str = ANSIColors.GREEN + number: str = ANSIColors.YELLOW + op: str = ANSIColors.RESET + definition: str = ANSIColors.BOLD + soft_keyword: str = ANSIColors.BOLD_BLUE + reset: str = ANSIColors.RESET + + +@dataclass(frozen=True) +class Traceback(ThemeSection): + type: str = ANSIColors.BOLD_MAGENTA + message: str = ANSIColors.MAGENTA + filename: str = ANSIColors.MAGENTA + line_no: str = ANSIColors.MAGENTA + frame: str = ANSIColors.MAGENTA + error_highlight: str = ANSIColors.BOLD_RED + error_range: str = ANSIColors.RED + reset: str = ANSIColors.RESET + + +@dataclass(frozen=True) +class Unittest(ThemeSection): + passed: str = ANSIColors.GREEN + warn: str = ANSIColors.YELLOW + fail: str = ANSIColors.RED + fail_info: str = ANSIColors.BOLD_RED + reset: str = ANSIColors.RESET + + +@dataclass(frozen=True) +class Theme: + """A suite of themes for all sections of Python. + + When adding a new one, remember to also modify `copy_with` and `no_colors` + below. + """ + argparse: Argparse = field(default_factory=Argparse) + syntax: Syntax = field(default_factory=Syntax) + traceback: Traceback = field(default_factory=Traceback) + unittest: Unittest = field(default_factory=Unittest) + + def copy_with( + self, + *, + argparse: Argparse | None = None, + syntax: Syntax | None = None, + traceback: Traceback | None = None, + unittest: Unittest | None = None, + ) -> Self: + """Return a new Theme based on this instance with some sections replaced. + + Themes are immutable to protect against accidental modifications that + could lead to invalid terminal states. + """ + return type(self)( + argparse=argparse or self.argparse, + syntax=syntax or self.syntax, + traceback=traceback or self.traceback, + unittest=unittest or self.unittest, + ) + + @classmethod + def no_colors(cls) -> Self: + """Return a new Theme where colors in all sections are empty strings. 
+ + This allows writing user code as if colors are always used. The color + fields will be ANSI color code strings when colorization is desired + and possible, and empty strings otherwise. + """ + return cls( + argparse=Argparse.no_colors(), + syntax=Syntax.no_colors(), + traceback=Traceback.no_colors(), + unittest=Unittest.no_colors(), + ) + + +def get_colors( + colorize: bool = False, *, file: IO[str] | IO[bytes] | None = None +) -> ANSIColors: + if colorize or can_colorize(file=file): + return ANSIColors() + else: + return NoColors + + +def decolor(text: str) -> str: + """Remove ANSI color codes from a string.""" + for code in ColorCodes: + text = text.replace(code, "") + return text + + +def can_colorize(*, file: IO[str] | IO[bytes] | None = None) -> bool: + + def _safe_getenv(k: str, fallback: str | None = None) -> str | None: + """Exception-safe environment retrieval. See gh-128636.""" + try: + return os.environ.get(k, fallback) + except Exception: + return fallback + + if file is None: + file = sys.stdout + + if not sys.flags.ignore_environment: + if _safe_getenv("PYTHON_COLORS") == "0": + return False + if _safe_getenv("PYTHON_COLORS") == "1": + return True + if _safe_getenv("NO_COLOR"): + return False + if not COLORIZE: + return False + if _safe_getenv("FORCE_COLOR"): + return True + if _safe_getenv("TERM") == "dumb": + return False + + if not hasattr(file, "fileno"): + return False + + if sys.platform == "win32": + try: + import nt + + if not nt._supports_virtual_terminal(): + return False + except (ImportError, AttributeError): + return False + + try: + return os.isatty(file.fileno()) + except OSError: + return hasattr(file, "isatty") and file.isatty() + + +default_theme = Theme() +theme_no_color = default_theme.no_colors() + + +def get_theme( + *, + tty_file: IO[str] | IO[bytes] | None = None, + force_color: bool = False, + force_no_color: bool = False, +) -> Theme: + """Returns the currently set theme, potentially in a zero-color variant. 
+ + In cases where colorizing is not possible (see `can_colorize`), the returned + theme contains all empty strings in all color definitions. + See `Theme.no_colors()` for more information. + + It is recommended not to cache the result of this function for extended + periods of time because the user might influence theme selection by + the interactive shell, a debugger, or application-specific code. The + environment (including environment variable state and console configuration + on Windows) can also change in the course of the application life cycle. + """ + if force_color or (not force_no_color and + can_colorize(file=tty_file)): + return _theme + return theme_no_color + + +def set_theme(t: Theme) -> None: + global _theme + + if not isinstance(t, Theme): + raise ValueError(f"Expected Theme object, found {t}") + + _theme = t + + +set_theme(default_theme) diff --git a/python/python3_13/examples/_compat_pickle.py b/python/python3_14/examples/_compat_pickle.py similarity index 100% rename from python/python3_13/examples/_compat_pickle.py rename to python/python3_14/examples/_compat_pickle.py diff --git a/python/python3_13/examples/_ios_support.py b/python/python3_14/examples/_ios_support.py similarity index 100% rename from python/python3_13/examples/_ios_support.py rename to python/python3_14/examples/_ios_support.py diff --git a/python/python3_13/examples/_markupbase.py b/python/python3_14/examples/_markupbase.py similarity index 99% rename from python/python3_13/examples/_markupbase.py rename to python/python3_14/examples/_markupbase.py index 3ad7e27996..614f0cd16d 100644 --- a/python/python3_13/examples/_markupbase.py +++ b/python/python3_14/examples/_markupbase.py @@ -13,7 +13,7 @@ _markedsectionclose = re.compile(r']\s*]\s*>') # An analysis of the MS-Word extensions is available at -# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf +# http://web.archive.org/web/20060321153828/http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf 
_msmarkedsectionclose = re.compile(r']\s*>') diff --git a/python/python3_14/examples/_opcode_metadata.py b/python/python3_14/examples/_opcode_metadata.py new file mode 100644 index 0000000000..b9304ec3c0 --- /dev/null +++ b/python/python3_14/examples/_opcode_metadata.py @@ -0,0 +1,371 @@ +# This file is generated by Tools/cases_generator/py_metadata_generator.py +# from: +# Python/bytecodes.c +# Do not edit! +_specializations = { + "RESUME": [ + "RESUME_CHECK", + ], + "LOAD_CONST": [ + "LOAD_CONST_MORTAL", + "LOAD_CONST_IMMORTAL", + ], + "TO_BOOL": [ + "TO_BOOL_ALWAYS_TRUE", + "TO_BOOL_BOOL", + "TO_BOOL_INT", + "TO_BOOL_LIST", + "TO_BOOL_NONE", + "TO_BOOL_STR", + ], + "BINARY_OP": [ + "BINARY_OP_MULTIPLY_INT", + "BINARY_OP_ADD_INT", + "BINARY_OP_SUBTRACT_INT", + "BINARY_OP_MULTIPLY_FLOAT", + "BINARY_OP_ADD_FLOAT", + "BINARY_OP_SUBTRACT_FLOAT", + "BINARY_OP_ADD_UNICODE", + "BINARY_OP_SUBSCR_LIST_INT", + "BINARY_OP_SUBSCR_LIST_SLICE", + "BINARY_OP_SUBSCR_TUPLE_INT", + "BINARY_OP_SUBSCR_STR_INT", + "BINARY_OP_SUBSCR_DICT", + "BINARY_OP_SUBSCR_GETITEM", + "BINARY_OP_EXTEND", + "BINARY_OP_INPLACE_ADD_UNICODE", + ], + "STORE_SUBSCR": [ + "STORE_SUBSCR_DICT", + "STORE_SUBSCR_LIST_INT", + ], + "SEND": [ + "SEND_GEN", + ], + "UNPACK_SEQUENCE": [ + "UNPACK_SEQUENCE_TWO_TUPLE", + "UNPACK_SEQUENCE_TUPLE", + "UNPACK_SEQUENCE_LIST", + ], + "STORE_ATTR": [ + "STORE_ATTR_INSTANCE_VALUE", + "STORE_ATTR_SLOT", + "STORE_ATTR_WITH_HINT", + ], + "LOAD_GLOBAL": [ + "LOAD_GLOBAL_MODULE", + "LOAD_GLOBAL_BUILTIN", + ], + "LOAD_SUPER_ATTR": [ + "LOAD_SUPER_ATTR_ATTR", + "LOAD_SUPER_ATTR_METHOD", + ], + "LOAD_ATTR": [ + "LOAD_ATTR_INSTANCE_VALUE", + "LOAD_ATTR_MODULE", + "LOAD_ATTR_WITH_HINT", + "LOAD_ATTR_SLOT", + "LOAD_ATTR_CLASS", + "LOAD_ATTR_CLASS_WITH_METACLASS_CHECK", + "LOAD_ATTR_PROPERTY", + "LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN", + "LOAD_ATTR_METHOD_WITH_VALUES", + "LOAD_ATTR_METHOD_NO_DICT", + "LOAD_ATTR_METHOD_LAZY_DICT", + "LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES", + 
"LOAD_ATTR_NONDESCRIPTOR_NO_DICT", + ], + "COMPARE_OP": [ + "COMPARE_OP_FLOAT", + "COMPARE_OP_INT", + "COMPARE_OP_STR", + ], + "CONTAINS_OP": [ + "CONTAINS_OP_SET", + "CONTAINS_OP_DICT", + ], + "JUMP_BACKWARD": [ + "JUMP_BACKWARD_NO_JIT", + "JUMP_BACKWARD_JIT", + ], + "FOR_ITER": [ + "FOR_ITER_LIST", + "FOR_ITER_TUPLE", + "FOR_ITER_RANGE", + "FOR_ITER_GEN", + ], + "CALL": [ + "CALL_BOUND_METHOD_EXACT_ARGS", + "CALL_PY_EXACT_ARGS", + "CALL_TYPE_1", + "CALL_STR_1", + "CALL_TUPLE_1", + "CALL_BUILTIN_CLASS", + "CALL_BUILTIN_O", + "CALL_BUILTIN_FAST", + "CALL_BUILTIN_FAST_WITH_KEYWORDS", + "CALL_LEN", + "CALL_ISINSTANCE", + "CALL_LIST_APPEND", + "CALL_METHOD_DESCRIPTOR_O", + "CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS", + "CALL_METHOD_DESCRIPTOR_NOARGS", + "CALL_METHOD_DESCRIPTOR_FAST", + "CALL_ALLOC_AND_ENTER_INIT", + "CALL_PY_GENERAL", + "CALL_BOUND_METHOD_GENERAL", + "CALL_NON_PY_GENERAL", + ], + "CALL_KW": [ + "CALL_KW_BOUND_METHOD", + "CALL_KW_PY", + "CALL_KW_NON_PY", + ], +} + +_specialized_opmap = { + 'BINARY_OP_ADD_FLOAT': 129, + 'BINARY_OP_ADD_INT': 130, + 'BINARY_OP_ADD_UNICODE': 131, + 'BINARY_OP_EXTEND': 132, + 'BINARY_OP_INPLACE_ADD_UNICODE': 3, + 'BINARY_OP_MULTIPLY_FLOAT': 133, + 'BINARY_OP_MULTIPLY_INT': 134, + 'BINARY_OP_SUBSCR_DICT': 135, + 'BINARY_OP_SUBSCR_GETITEM': 136, + 'BINARY_OP_SUBSCR_LIST_INT': 137, + 'BINARY_OP_SUBSCR_LIST_SLICE': 138, + 'BINARY_OP_SUBSCR_STR_INT': 139, + 'BINARY_OP_SUBSCR_TUPLE_INT': 140, + 'BINARY_OP_SUBTRACT_FLOAT': 141, + 'BINARY_OP_SUBTRACT_INT': 142, + 'CALL_ALLOC_AND_ENTER_INIT': 143, + 'CALL_BOUND_METHOD_EXACT_ARGS': 144, + 'CALL_BOUND_METHOD_GENERAL': 145, + 'CALL_BUILTIN_CLASS': 146, + 'CALL_BUILTIN_FAST': 147, + 'CALL_BUILTIN_FAST_WITH_KEYWORDS': 148, + 'CALL_BUILTIN_O': 149, + 'CALL_ISINSTANCE': 150, + 'CALL_KW_BOUND_METHOD': 151, + 'CALL_KW_NON_PY': 152, + 'CALL_KW_PY': 153, + 'CALL_LEN': 154, + 'CALL_LIST_APPEND': 155, + 'CALL_METHOD_DESCRIPTOR_FAST': 156, + 'CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS': 157, + 
'CALL_METHOD_DESCRIPTOR_NOARGS': 158, + 'CALL_METHOD_DESCRIPTOR_O': 159, + 'CALL_NON_PY_GENERAL': 160, + 'CALL_PY_EXACT_ARGS': 161, + 'CALL_PY_GENERAL': 162, + 'CALL_STR_1': 163, + 'CALL_TUPLE_1': 164, + 'CALL_TYPE_1': 165, + 'COMPARE_OP_FLOAT': 166, + 'COMPARE_OP_INT': 167, + 'COMPARE_OP_STR': 168, + 'CONTAINS_OP_DICT': 169, + 'CONTAINS_OP_SET': 170, + 'FOR_ITER_GEN': 171, + 'FOR_ITER_LIST': 172, + 'FOR_ITER_RANGE': 173, + 'FOR_ITER_TUPLE': 174, + 'JUMP_BACKWARD_JIT': 175, + 'JUMP_BACKWARD_NO_JIT': 176, + 'LOAD_ATTR_CLASS': 177, + 'LOAD_ATTR_CLASS_WITH_METACLASS_CHECK': 178, + 'LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN': 179, + 'LOAD_ATTR_INSTANCE_VALUE': 180, + 'LOAD_ATTR_METHOD_LAZY_DICT': 181, + 'LOAD_ATTR_METHOD_NO_DICT': 182, + 'LOAD_ATTR_METHOD_WITH_VALUES': 183, + 'LOAD_ATTR_MODULE': 184, + 'LOAD_ATTR_NONDESCRIPTOR_NO_DICT': 185, + 'LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES': 186, + 'LOAD_ATTR_PROPERTY': 187, + 'LOAD_ATTR_SLOT': 188, + 'LOAD_ATTR_WITH_HINT': 189, + 'LOAD_CONST_IMMORTAL': 190, + 'LOAD_CONST_MORTAL': 191, + 'LOAD_GLOBAL_BUILTIN': 192, + 'LOAD_GLOBAL_MODULE': 193, + 'LOAD_SUPER_ATTR_ATTR': 194, + 'LOAD_SUPER_ATTR_METHOD': 195, + 'RESUME_CHECK': 196, + 'SEND_GEN': 197, + 'STORE_ATTR_INSTANCE_VALUE': 198, + 'STORE_ATTR_SLOT': 199, + 'STORE_ATTR_WITH_HINT': 200, + 'STORE_SUBSCR_DICT': 201, + 'STORE_SUBSCR_LIST_INT': 202, + 'TO_BOOL_ALWAYS_TRUE': 203, + 'TO_BOOL_BOOL': 204, + 'TO_BOOL_INT': 205, + 'TO_BOOL_LIST': 206, + 'TO_BOOL_NONE': 207, + 'TO_BOOL_STR': 208, + 'UNPACK_SEQUENCE_LIST': 209, + 'UNPACK_SEQUENCE_TUPLE': 210, + 'UNPACK_SEQUENCE_TWO_TUPLE': 211, +} + +opmap = { + 'CACHE': 0, + 'RESERVED': 17, + 'RESUME': 128, + 'INSTRUMENTED_LINE': 254, + 'ENTER_EXECUTOR': 255, + 'BINARY_SLICE': 1, + 'BUILD_TEMPLATE': 2, + 'CALL_FUNCTION_EX': 4, + 'CHECK_EG_MATCH': 5, + 'CHECK_EXC_MATCH': 6, + 'CLEANUP_THROW': 7, + 'DELETE_SUBSCR': 8, + 'END_FOR': 9, + 'END_SEND': 10, + 'EXIT_INIT_CHECK': 11, + 'FORMAT_SIMPLE': 12, + 'FORMAT_WITH_SPEC': 13, + 'GET_AITER': 14, + 
'GET_ANEXT': 15, + 'GET_ITER': 16, + 'GET_LEN': 18, + 'GET_YIELD_FROM_ITER': 19, + 'INTERPRETER_EXIT': 20, + 'LOAD_BUILD_CLASS': 21, + 'LOAD_LOCALS': 22, + 'MAKE_FUNCTION': 23, + 'MATCH_KEYS': 24, + 'MATCH_MAPPING': 25, + 'MATCH_SEQUENCE': 26, + 'NOP': 27, + 'NOT_TAKEN': 28, + 'POP_EXCEPT': 29, + 'POP_ITER': 30, + 'POP_TOP': 31, + 'PUSH_EXC_INFO': 32, + 'PUSH_NULL': 33, + 'RETURN_GENERATOR': 34, + 'RETURN_VALUE': 35, + 'SETUP_ANNOTATIONS': 36, + 'STORE_SLICE': 37, + 'STORE_SUBSCR': 38, + 'TO_BOOL': 39, + 'UNARY_INVERT': 40, + 'UNARY_NEGATIVE': 41, + 'UNARY_NOT': 42, + 'WITH_EXCEPT_START': 43, + 'BINARY_OP': 44, + 'BUILD_INTERPOLATION': 45, + 'BUILD_LIST': 46, + 'BUILD_MAP': 47, + 'BUILD_SET': 48, + 'BUILD_SLICE': 49, + 'BUILD_STRING': 50, + 'BUILD_TUPLE': 51, + 'CALL': 52, + 'CALL_INTRINSIC_1': 53, + 'CALL_INTRINSIC_2': 54, + 'CALL_KW': 55, + 'COMPARE_OP': 56, + 'CONTAINS_OP': 57, + 'CONVERT_VALUE': 58, + 'COPY': 59, + 'COPY_FREE_VARS': 60, + 'DELETE_ATTR': 61, + 'DELETE_DEREF': 62, + 'DELETE_FAST': 63, + 'DELETE_GLOBAL': 64, + 'DELETE_NAME': 65, + 'DICT_MERGE': 66, + 'DICT_UPDATE': 67, + 'END_ASYNC_FOR': 68, + 'EXTENDED_ARG': 69, + 'FOR_ITER': 70, + 'GET_AWAITABLE': 71, + 'IMPORT_FROM': 72, + 'IMPORT_NAME': 73, + 'IS_OP': 74, + 'JUMP_BACKWARD': 75, + 'JUMP_BACKWARD_NO_INTERRUPT': 76, + 'JUMP_FORWARD': 77, + 'LIST_APPEND': 78, + 'LIST_EXTEND': 79, + 'LOAD_ATTR': 80, + 'LOAD_COMMON_CONSTANT': 81, + 'LOAD_CONST': 82, + 'LOAD_DEREF': 83, + 'LOAD_FAST': 84, + 'LOAD_FAST_AND_CLEAR': 85, + 'LOAD_FAST_BORROW': 86, + 'LOAD_FAST_BORROW_LOAD_FAST_BORROW': 87, + 'LOAD_FAST_CHECK': 88, + 'LOAD_FAST_LOAD_FAST': 89, + 'LOAD_FROM_DICT_OR_DEREF': 90, + 'LOAD_FROM_DICT_OR_GLOBALS': 91, + 'LOAD_GLOBAL': 92, + 'LOAD_NAME': 93, + 'LOAD_SMALL_INT': 94, + 'LOAD_SPECIAL': 95, + 'LOAD_SUPER_ATTR': 96, + 'MAKE_CELL': 97, + 'MAP_ADD': 98, + 'MATCH_CLASS': 99, + 'POP_JUMP_IF_FALSE': 100, + 'POP_JUMP_IF_NONE': 101, + 'POP_JUMP_IF_NOT_NONE': 102, + 'POP_JUMP_IF_TRUE': 103, + 'RAISE_VARARGS': 
104, + 'RERAISE': 105, + 'SEND': 106, + 'SET_ADD': 107, + 'SET_FUNCTION_ATTRIBUTE': 108, + 'SET_UPDATE': 109, + 'STORE_ATTR': 110, + 'STORE_DEREF': 111, + 'STORE_FAST': 112, + 'STORE_FAST_LOAD_FAST': 113, + 'STORE_FAST_STORE_FAST': 114, + 'STORE_GLOBAL': 115, + 'STORE_NAME': 116, + 'SWAP': 117, + 'UNPACK_EX': 118, + 'UNPACK_SEQUENCE': 119, + 'YIELD_VALUE': 120, + 'INSTRUMENTED_END_FOR': 234, + 'INSTRUMENTED_POP_ITER': 235, + 'INSTRUMENTED_END_SEND': 236, + 'INSTRUMENTED_FOR_ITER': 237, + 'INSTRUMENTED_INSTRUCTION': 238, + 'INSTRUMENTED_JUMP_FORWARD': 239, + 'INSTRUMENTED_NOT_TAKEN': 240, + 'INSTRUMENTED_POP_JUMP_IF_TRUE': 241, + 'INSTRUMENTED_POP_JUMP_IF_FALSE': 242, + 'INSTRUMENTED_POP_JUMP_IF_NONE': 243, + 'INSTRUMENTED_POP_JUMP_IF_NOT_NONE': 244, + 'INSTRUMENTED_RESUME': 245, + 'INSTRUMENTED_RETURN_VALUE': 246, + 'INSTRUMENTED_YIELD_VALUE': 247, + 'INSTRUMENTED_END_ASYNC_FOR': 248, + 'INSTRUMENTED_LOAD_SUPER_ATTR': 249, + 'INSTRUMENTED_CALL': 250, + 'INSTRUMENTED_CALL_KW': 251, + 'INSTRUMENTED_CALL_FUNCTION_EX': 252, + 'INSTRUMENTED_JUMP_BACKWARD': 253, + 'ANNOTATIONS_PLACEHOLDER': 256, + 'JUMP': 257, + 'JUMP_IF_FALSE': 258, + 'JUMP_IF_TRUE': 259, + 'JUMP_NO_INTERRUPT': 260, + 'LOAD_CLOSURE': 261, + 'POP_BLOCK': 262, + 'SETUP_CLEANUP': 263, + 'SETUP_FINALLY': 264, + 'SETUP_WITH': 265, + 'STORE_FAST_MAYBE_NULL': 266, +} + +HAVE_ARGUMENT = 43 +MIN_INSTRUMENTED_OPCODE = 234 diff --git a/python/python3_13/examples/_osx_support.py b/python/python3_14/examples/_osx_support.py similarity index 100% rename from python/python3_13/examples/_osx_support.py rename to python/python3_14/examples/_osx_support.py diff --git a/python/python3_13/examples/_py_abc.py b/python/python3_14/examples/_py_abc.py similarity index 100% rename from python/python3_13/examples/_py_abc.py rename to python/python3_14/examples/_py_abc.py diff --git a/python/python3_14/examples/_py_warnings.py b/python/python3_14/examples/_py_warnings.py new file mode 100644 index 0000000000..55f8c06959 --- 
/dev/null +++ b/python/python3_14/examples/_py_warnings.py @@ -0,0 +1,869 @@ +"""Python part of the warnings subsystem.""" + +import sys +import _contextvars +import _thread + + +__all__ = ["warn", "warn_explicit", "showwarning", + "formatwarning", "filterwarnings", "simplefilter", + "resetwarnings", "catch_warnings", "deprecated"] + + +# Normally '_wm' is sys.modules['warnings'] but for unit tests it can be +# a different module. User code is allowed to reassign global attributes +# of the 'warnings' module, commonly 'filters' or 'showwarning'. So we +# need to lookup these global attributes dynamically on the '_wm' object, +# rather than binding them earlier. The code in this module consistently uses +# '_wm.' rather than using the globals of this module. If the +# '_warnings' C extension is in use, some globals are replaced by functions +# and variables defined in that extension. +_wm = None + + +def _set_module(module): + global _wm + _wm = module + + +# filters contains a sequence of filter 5-tuples +# The components of the 5-tuple are: +# - an action: error, ignore, always, all, default, module, or once +# - a compiled regex that must match the warning message +# - a class representing the warning category +# - a compiled regex that must match the module that is being warned +# - a line number for the line being warning, or 0 to mean any line +# If either if the compiled regexs are None, match anything. +filters = [] + + +defaultaction = "default" +onceregistry = {} +_lock = _thread.RLock() +_filters_version = 1 + + +# If true, catch_warnings() will use a context var to hold the modified +# filters list. Otherwise, catch_warnings() will operate on the 'filters' +# global of the warnings module. 
+_use_context = sys.flags.context_aware_warnings + + +class _Context: + def __init__(self, filters): + self._filters = filters + self.log = None # if set to a list, logging is enabled + + def copy(self): + context = _Context(self._filters[:]) + if self.log is not None: + context.log = self.log + return context + + def _record_warning(self, msg): + self.log.append(msg) + + +class _GlobalContext(_Context): + def __init__(self): + self.log = None + + @property + def _filters(self): + # Since there is quite a lot of code that assigns to + # warnings.filters, this needs to return the current value of + # the module global. + try: + return _wm.filters + except AttributeError: + # 'filters' global was deleted. Do we need to actually handle this case? + return [] + + +_global_context = _GlobalContext() + + +_warnings_context = _contextvars.ContextVar('warnings_context') + + +def _get_context(): + if not _use_context: + return _global_context + try: + return _wm._warnings_context.get() + except LookupError: + return _global_context + + +def _set_context(context): + assert _use_context + _wm._warnings_context.set(context) + + +def _new_context(): + assert _use_context + old_context = _wm._get_context() + new_context = old_context.copy() + _wm._set_context(new_context) + return old_context, new_context + + +def _get_filters(): + """Return the current list of filters. 
This is a non-public API used by + module functions and by the unit tests.""" + return _wm._get_context()._filters + + +def _filters_mutated_lock_held(): + _wm._filters_version += 1 + + +def showwarning(message, category, filename, lineno, file=None, line=None): + """Hook to write a warning to a file; replace if you like.""" + msg = _wm.WarningMessage(message, category, filename, lineno, file, line) + _wm._showwarnmsg_impl(msg) + + +def formatwarning(message, category, filename, lineno, line=None): + """Function to format a warning the standard way.""" + msg = _wm.WarningMessage(message, category, filename, lineno, None, line) + return _wm._formatwarnmsg_impl(msg) + + +def _showwarnmsg_impl(msg): + context = _wm._get_context() + if context.log is not None: + context._record_warning(msg) + return + file = msg.file + if file is None: + file = sys.stderr + if file is None: + # sys.stderr is None when run with pythonw.exe: + # warnings get lost + return + text = _wm._formatwarnmsg(msg) + try: + file.write(text) + except OSError: + # the file (probably stderr) is invalid - this warning gets lost. + pass + + +def _formatwarnmsg_impl(msg): + category = msg.category.__name__ + s = f"{msg.filename}:{msg.lineno}: {category}: {msg.message}\n" + + if msg.line is None: + try: + import linecache + line = linecache.getline(msg.filename, msg.lineno) + except Exception: + # When a warning is logged during Python shutdown, linecache + # and the import machinery don't work anymore + line = None + linecache = None + else: + line = msg.line + if line: + line = line.strip() + s += " %s\n" % line + + if msg.source is not None: + try: + import tracemalloc + # Logging a warning should not raise a new exception: + # catch Exception, not only ImportError and RecursionError. 
+ except Exception: + # don't suggest to enable tracemalloc if it's not available + suggest_tracemalloc = False + tb = None + else: + try: + suggest_tracemalloc = not tracemalloc.is_tracing() + tb = tracemalloc.get_object_traceback(msg.source) + except Exception: + # When a warning is logged during Python shutdown, tracemalloc + # and the import machinery don't work anymore + suggest_tracemalloc = False + tb = None + + if tb is not None: + s += 'Object allocated at (most recent call last):\n' + for frame in tb: + s += (' File "%s", lineno %s\n' + % (frame.filename, frame.lineno)) + + try: + if linecache is not None: + line = linecache.getline(frame.filename, frame.lineno) + else: + line = None + except Exception: + line = None + if line: + line = line.strip() + s += ' %s\n' % line + elif suggest_tracemalloc: + s += (f'{category}: Enable tracemalloc to get the object ' + f'allocation traceback\n') + return s + + +# Keep a reference to check if the function was replaced +_showwarning_orig = showwarning + + +def _showwarnmsg(msg): + """Hook to write a warning to a file; replace if you like.""" + try: + sw = _wm.showwarning + except AttributeError: + pass + else: + if sw is not _showwarning_orig: + # warnings.showwarning() was replaced + if not callable(sw): + raise TypeError("warnings.showwarning() must be set to a " + "function or method") + + sw(msg.message, msg.category, msg.filename, msg.lineno, + msg.file, msg.line) + return + _wm._showwarnmsg_impl(msg) + + +# Keep a reference to check if the function was replaced +_formatwarning_orig = formatwarning + + +def _formatwarnmsg(msg): + """Function to format a warning the standard way.""" + try: + fw = _wm.formatwarning + except AttributeError: + pass + else: + if fw is not _formatwarning_orig: + # warnings.formatwarning() was replaced + return fw(msg.message, msg.category, + msg.filename, msg.lineno, msg.line) + return _wm._formatwarnmsg_impl(msg) + + +def filterwarnings(action, message="", category=Warning, 
module="", lineno=0, + append=False): + """Insert an entry into the list of warnings filters (at the front). + + 'action' -- one of "error", "ignore", "always", "all", "default", "module", + or "once" + 'message' -- a regex that the warning message must match + 'category' -- a class that the warning must be a subclass of + 'module' -- a regex that the module name must match + 'lineno' -- an integer line number, 0 matches all warnings + 'append' -- if true, append to the list of filters + """ + if action not in {"error", "ignore", "always", "all", "default", "module", "once"}: + raise ValueError(f"invalid action: {action!r}") + if not isinstance(message, str): + raise TypeError("message must be a string") + if not isinstance(category, type) or not issubclass(category, Warning): + raise TypeError("category must be a Warning subclass") + if not isinstance(module, str): + raise TypeError("module must be a string") + if not isinstance(lineno, int): + raise TypeError("lineno must be an int") + if lineno < 0: + raise ValueError("lineno must be an int >= 0") + + if message or module: + import re + + if message: + message = re.compile(message, re.I) + else: + message = None + if module: + module = re.compile(module) + else: + module = None + + _wm._add_filter(action, message, category, module, lineno, append=append) + + +def simplefilter(action, category=Warning, lineno=0, append=False): + """Insert a simple entry into the list of warnings filters (at the front). + + A simple filter matches all modules and messages. 
+ 'action' -- one of "error", "ignore", "always", "all", "default", "module", + or "once" + 'category' -- a class that the warning must be a subclass of + 'lineno' -- an integer line number, 0 matches all warnings + 'append' -- if true, append to the list of filters + """ + if action not in {"error", "ignore", "always", "all", "default", "module", "once"}: + raise ValueError(f"invalid action: {action!r}") + if not isinstance(lineno, int): + raise TypeError("lineno must be an int") + if lineno < 0: + raise ValueError("lineno must be an int >= 0") + _wm._add_filter(action, None, category, None, lineno, append=append) + + +def _filters_mutated(): + # Even though this function is not part of the public API, it's used by + # a fair amount of user code. + with _wm._lock: + _wm._filters_mutated_lock_held() + + +def _add_filter(*item, append): + with _wm._lock: + filters = _wm._get_filters() + if not append: + # Remove possible duplicate filters, so new one will be placed + # in correct place. If append=True and duplicate exists, do nothing. 
+ try: + filters.remove(item) + except ValueError: + pass + filters.insert(0, item) + else: + if item not in filters: + filters.append(item) + _wm._filters_mutated_lock_held() + + +def resetwarnings(): + """Clear the list of warning filters, so that no filters are active.""" + with _wm._lock: + del _wm._get_filters()[:] + _wm._filters_mutated_lock_held() + + +class _OptionError(Exception): + """Exception used by option processing helpers.""" + pass + + +# Helper to process -W options passed via sys.warnoptions +def _processoptions(args): + for arg in args: + try: + _wm._setoption(arg) + except _wm._OptionError as msg: + print("Invalid -W option ignored:", msg, file=sys.stderr) + + +# Helper for _processoptions() +def _setoption(arg): + parts = arg.split(':') + if len(parts) > 5: + raise _wm._OptionError("too many fields (max 5): %r" % (arg,)) + while len(parts) < 5: + parts.append('') + action, message, category, module, lineno = [s.strip() + for s in parts] + action = _wm._getaction(action) + category = _wm._getcategory(category) + if message or module: + import re + if message: + message = re.escape(message) + if module: + module = re.escape(module) + r'\z' + if lineno: + try: + lineno = int(lineno) + if lineno < 0: + raise ValueError + except (ValueError, OverflowError): + raise _wm._OptionError("invalid lineno %r" % (lineno,)) from None + else: + lineno = 0 + _wm.filterwarnings(action, message, category, module, lineno) + + +# Helper for _setoption() +def _getaction(action): + if not action: + return "default" + for a in ('default', 'always', 'all', 'ignore', 'module', 'once', 'error'): + if a.startswith(action): + return a + raise _wm._OptionError("invalid action: %r" % (action,)) + + +# Helper for _setoption() +def _getcategory(category): + if not category: + return Warning + if '.' 
not in category: + import builtins as m + klass = category + else: + module, _, klass = category.rpartition('.') + try: + m = __import__(module, None, None, [klass]) + except ImportError: + raise _wm._OptionError("invalid module name: %r" % (module,)) from None + try: + cat = getattr(m, klass) + except AttributeError: + raise _wm._OptionError("unknown warning category: %r" % (category,)) from None + if not issubclass(cat, Warning): + raise _wm._OptionError("invalid warning category: %r" % (category,)) + return cat + + +def _is_internal_filename(filename): + return 'importlib' in filename and '_bootstrap' in filename + + +def _is_filename_to_skip(filename, skip_file_prefixes): + return any(filename.startswith(prefix) for prefix in skip_file_prefixes) + + +def _is_internal_frame(frame): + """Signal whether the frame is an internal CPython implementation detail.""" + return _is_internal_filename(frame.f_code.co_filename) + + +def _next_external_frame(frame, skip_file_prefixes): + """Find the next frame that doesn't involve Python or user internals.""" + frame = frame.f_back + while frame is not None and ( + _is_internal_filename(filename := frame.f_code.co_filename) or + _is_filename_to_skip(filename, skip_file_prefixes)): + frame = frame.f_back + return frame + + +# Code typically replaced by _warnings +def warn(message, category=None, stacklevel=1, source=None, + *, skip_file_prefixes=()): + """Issue a warning, or maybe ignore it or raise an exception.""" + # Check if message is already a Warning object + if isinstance(message, Warning): + category = message.__class__ + # Check category argument + if category is None: + category = UserWarning + if not (isinstance(category, type) and issubclass(category, Warning)): + raise TypeError("category must be a Warning subclass, " + "not '{:s}'".format(type(category).__name__)) + if not isinstance(skip_file_prefixes, tuple): + # The C version demands a tuple for implementation performance. 
+ raise TypeError('skip_file_prefixes must be a tuple of strs.') + if skip_file_prefixes: + stacklevel = max(2, stacklevel) + # Get context information + try: + if stacklevel <= 1 or _is_internal_frame(sys._getframe(1)): + # If frame is too small to care or if the warning originated in + # internal code, then do not try to hide any frames. + frame = sys._getframe(stacklevel) + else: + frame = sys._getframe(1) + # Look for one frame less since the above line starts us off. + for x in range(stacklevel-1): + frame = _next_external_frame(frame, skip_file_prefixes) + if frame is None: + raise ValueError + except ValueError: + globals = sys.__dict__ + filename = "" + lineno = 0 + else: + globals = frame.f_globals + filename = frame.f_code.co_filename + lineno = frame.f_lineno + if '__name__' in globals: + module = globals['__name__'] + else: + module = "" + registry = globals.setdefault("__warningregistry__", {}) + _wm.warn_explicit( + message, + category, + filename, + lineno, + module, + registry, + globals, + source=source, + ) + + +def warn_explicit(message, category, filename, lineno, + module=None, registry=None, module_globals=None, + source=None): + lineno = int(lineno) + if module is None: + module = filename or "" + if module[-3:].lower() == ".py": + module = module[:-3] # XXX What about leading pathname? 
+ if isinstance(message, Warning): + text = str(message) + category = message.__class__ + else: + text = message + message = category(message) + key = (text, category, lineno) + with _wm._lock: + if registry is None: + registry = {} + if registry.get('version', 0) != _wm._filters_version: + registry.clear() + registry['version'] = _wm._filters_version + # Quick test for common case + if registry.get(key): + return + # Search the filters + for item in _wm._get_filters(): + action, msg, cat, mod, ln = item + if ((msg is None or msg.match(text)) and + issubclass(category, cat) and + (mod is None or mod.match(module)) and + (ln == 0 or lineno == ln)): + break + else: + action = _wm.defaultaction + # Early exit actions + if action == "ignore": + return + + if action == "error": + raise message + # Other actions + if action == "once": + registry[key] = 1 + oncekey = (text, category) + if _wm.onceregistry.get(oncekey): + return + _wm.onceregistry[oncekey] = 1 + elif action in {"always", "all"}: + pass + elif action == "module": + registry[key] = 1 + altkey = (text, category, 0) + if registry.get(altkey): + return + registry[altkey] = 1 + elif action == "default": + registry[key] = 1 + else: + # Unrecognized actions are errors + raise RuntimeError( + "Unrecognized action (%r) in warnings.filters:\n %s" % + (action, item)) + + # Prime the linecache for formatting, in case the + # "file" is actually in a zipfile or something. 
+ import linecache + linecache.getlines(filename, module_globals) + + # Print message and context + msg = _wm.WarningMessage(message, category, filename, lineno, source=source) + _wm._showwarnmsg(msg) + + +class WarningMessage(object): + + _WARNING_DETAILS = ("message", "category", "filename", "lineno", "file", + "line", "source") + + def __init__(self, message, category, filename, lineno, file=None, + line=None, source=None): + self.message = message + self.category = category + self.filename = filename + self.lineno = lineno + self.file = file + self.line = line + self.source = source + self._category_name = category.__name__ if category else None + + def __str__(self): + return ("{message : %r, category : %r, filename : %r, lineno : %s, " + "line : %r}" % (self.message, self._category_name, + self.filename, self.lineno, self.line)) + + def __repr__(self): + return f'<{type(self).__qualname__} {self}>' + + +class catch_warnings(object): + + """A context manager that copies and restores the warnings filter upon + exiting the context. + + The 'record' argument specifies whether warnings should be captured by a + custom implementation of warnings.showwarning() and be appended to a list + returned by the context manager. Otherwise None is returned by the context + manager. The objects appended to the list are arguments whose attributes + mirror the arguments to showwarning(). + + The 'module' argument is to specify an alternative module to the module + named 'warnings' and imported under that name. This argument is only useful + when testing the warnings module itself. + + If the 'action' argument is not None, the remaining arguments are passed + to warnings.simplefilter() as if it were called immediately on entering the + context. + """ + + def __init__(self, *, record=False, module=None, + action=None, category=Warning, lineno=0, append=False): + """Specify whether to record warnings and if an alternative module + should be used other than sys.modules['warnings']. 
+ + """ + self._record = record + self._module = sys.modules['warnings'] if module is None else module + self._entered = False + if action is None: + self._filter = None + else: + self._filter = (action, category, lineno, append) + + def __repr__(self): + args = [] + if self._record: + args.append("record=True") + if self._module is not sys.modules['warnings']: + args.append("module=%r" % self._module) + name = type(self).__name__ + return "%s(%s)" % (name, ", ".join(args)) + + def __enter__(self): + if self._entered: + raise RuntimeError("Cannot enter %r twice" % self) + self._entered = True + with _wm._lock: + if _use_context: + self._saved_context, context = self._module._new_context() + else: + context = None + self._filters = self._module.filters + self._module.filters = self._filters[:] + self._showwarning = self._module.showwarning + self._showwarnmsg_impl = self._module._showwarnmsg_impl + self._module._filters_mutated_lock_held() + if self._record: + if _use_context: + context.log = log = [] + else: + log = [] + self._module._showwarnmsg_impl = log.append + # Reset showwarning() to the default implementation to make sure + # that _showwarnmsg() calls _showwarnmsg_impl() + self._module.showwarning = self._module._showwarning_orig + else: + log = None + if self._filter is not None: + self._module.simplefilter(*self._filter) + return log + + def __exit__(self, *exc_info): + if not self._entered: + raise RuntimeError("Cannot exit %r without entering first" % self) + with _wm._lock: + if _use_context: + self._module._warnings_context.set(self._saved_context) + else: + self._module.filters = self._filters + self._module.showwarning = self._showwarning + self._module._showwarnmsg_impl = self._showwarnmsg_impl + self._module._filters_mutated_lock_held() + + +class deprecated: + """Indicate that a class, function or overload is deprecated. 
+ + When this decorator is applied to an object, the type checker + will generate a diagnostic on usage of the deprecated object. + + Usage: + + @deprecated("Use B instead") + class A: + pass + + @deprecated("Use g instead") + def f(): + pass + + @overload + @deprecated("int support is deprecated") + def g(x: int) -> int: ... + @overload + def g(x: str) -> int: ... + + The warning specified by *category* will be emitted at runtime + on use of deprecated objects. For functions, that happens on calls; + for classes, on instantiation and on creation of subclasses. + If the *category* is ``None``, no warning is emitted at runtime. + The *stacklevel* determines where the + warning is emitted. If it is ``1`` (the default), the warning + is emitted at the direct caller of the deprecated object; if it + is higher, it is emitted further up the stack. + Static type checker behavior is not affected by the *category* + and *stacklevel* arguments. + + The deprecation message passed to the decorator is saved in the + ``__deprecated__`` attribute on the decorated object. + If applied to an overload, the decorator + must be after the ``@overload`` decorator for the attribute to + exist on the overload as returned by ``get_overloads()``. + + See PEP 702 for details. + + """ + def __init__( + self, + message: str, + /, + *, + category: type[Warning] | None = DeprecationWarning, + stacklevel: int = 1, + ) -> None: + if not isinstance(message, str): + raise TypeError( + f"Expected an object of type str for 'message', not {type(message).__name__!r}" + ) + self.message = message + self.category = category + self.stacklevel = stacklevel + + def __call__(self, arg, /): + # Make sure the inner functions created below don't + # retain a reference to self. 
+ msg = self.message + category = self.category + stacklevel = self.stacklevel + if category is None: + arg.__deprecated__ = msg + return arg + elif isinstance(arg, type): + import functools + from types import MethodType + + original_new = arg.__new__ + + @functools.wraps(original_new) + def __new__(cls, /, *args, **kwargs): + if cls is arg: + _wm.warn(msg, category=category, stacklevel=stacklevel + 1) + if original_new is not object.__new__: + return original_new(cls, *args, **kwargs) + # Mirrors a similar check in object.__new__. + elif cls.__init__ is object.__init__ and (args or kwargs): + raise TypeError(f"{cls.__name__}() takes no arguments") + else: + return original_new(cls) + + arg.__new__ = staticmethod(__new__) + + if "__init_subclass__" in arg.__dict__: + # __init_subclass__ is directly present on the decorated class. + # Synthesize a wrapper that calls this method directly. + original_init_subclass = arg.__init_subclass__ + # We need slightly different behavior if __init_subclass__ + # is a bound method (likely if it was implemented in Python). + # Otherwise, it likely means it's a builtin such as + # object's implementation of __init_subclass__. 
+ if isinstance(original_init_subclass, MethodType): + original_init_subclass = original_init_subclass.__func__ + + @functools.wraps(original_init_subclass) + def __init_subclass__(*args, **kwargs): + _wm.warn(msg, category=category, stacklevel=stacklevel + 1) + return original_init_subclass(*args, **kwargs) + else: + def __init_subclass__(cls, *args, **kwargs): + _wm.warn(msg, category=category, stacklevel=stacklevel + 1) + return super(arg, cls).__init_subclass__(*args, **kwargs) + + arg.__init_subclass__ = classmethod(__init_subclass__) + + arg.__deprecated__ = __new__.__deprecated__ = msg + __init_subclass__.__deprecated__ = msg + return arg + elif callable(arg): + import functools + import inspect + + @functools.wraps(arg) + def wrapper(*args, **kwargs): + _wm.warn(msg, category=category, stacklevel=stacklevel + 1) + return arg(*args, **kwargs) + + if inspect.iscoroutinefunction(arg): + wrapper = inspect.markcoroutinefunction(wrapper) + + arg.__deprecated__ = wrapper.__deprecated__ = msg + return wrapper + else: + raise TypeError( + "@deprecated decorator with non-None category must be applied to " + f"a class or callable, not {arg!r}" + ) + + +_DEPRECATED_MSG = "{name!r} is deprecated and slated for removal in Python {remove}" + + +def _deprecated(name, message=_DEPRECATED_MSG, *, remove, _version=sys.version_info): + """Warn that *name* is deprecated or should be removed. + + RuntimeError is raised if *remove* specifies a major/minor tuple older than + the current Python version or the same version but past the alpha. + + The *message* argument is formatted with *name* and *remove* as a Python + version tuple (e.g. (3, 11)). 
+ + """ + remove_formatted = f"{remove[0]}.{remove[1]}" + if (_version[:2] > remove) or (_version[:2] == remove and _version[3] != "alpha"): + msg = f"{name!r} was slated for removal after Python {remove_formatted} alpha" + raise RuntimeError(msg) + else: + msg = message.format(name=name, remove=remove_formatted) + _wm.warn(msg, DeprecationWarning, stacklevel=3) + + +# Private utility function called by _PyErr_WarnUnawaitedCoroutine +def _warn_unawaited_coroutine(coro): + msg_lines = [ + f"coroutine '{coro.__qualname__}' was never awaited\n" + ] + if coro.cr_origin is not None: + import linecache, traceback + def extract(): + for filename, lineno, funcname in reversed(coro.cr_origin): + line = linecache.getline(filename, lineno) + yield (filename, lineno, funcname, line) + msg_lines.append("Coroutine created at (most recent call last)\n") + msg_lines += traceback.format_list(list(extract())) + msg = "".join(msg_lines).rstrip("\n") + # Passing source= here means that if the user happens to have tracemalloc + # enabled and tracking where the coroutine was created, the warning will + # contain that traceback. This does mean that if they have *both* + # coroutine origin tracking *and* tracemalloc enabled, they'll get two + # partially-redundant tracebacks. If we wanted to be clever we could + # probably detect this case and avoid it, but for now we don't bother. 
+ _wm.warn( + msg, category=RuntimeWarning, stacklevel=2, source=coro + ) + + +def _setup_defaults(): + # Several warning categories are ignored by default in regular builds + if hasattr(sys, 'gettotalrefcount'): + return + _wm.filterwarnings("default", category=DeprecationWarning, module="__main__", append=1) + _wm.simplefilter("ignore", category=DeprecationWarning, append=1) + _wm.simplefilter("ignore", category=PendingDeprecationWarning, append=1) + _wm.simplefilter("ignore", category=ImportWarning, append=1) + _wm.simplefilter("ignore", category=ResourceWarning, append=1) diff --git a/python/python3_13/examples/_pydatetime.py b/python/python3_14/examples/_pydatetime.py similarity index 92% rename from python/python3_13/examples/_pydatetime.py rename to python/python3_14/examples/_pydatetime.py index 34ccb2da13..70251dbb65 100644 --- a/python/python3_13/examples/_pydatetime.py +++ b/python/python3_14/examples/_pydatetime.py @@ -1,12 +1,10 @@ -"""Concrete date/time and related types. - -See http://www.iana.org/time-zones/repository/tz-link.html for -time zone and DST data sources. -""" +"""Pure Python implementation of the datetime module.""" __all__ = ("date", "datetime", "time", "timedelta", "timezone", "tzinfo", "MINYEAR", "MAXYEAR", "UTC") +__name__ = "datetime" + import time as _time import math as _math @@ -18,10 +16,10 @@ def _cmp(x, y): def _get_class_module(self): module_name = self.__class__.__module__ - if module_name == '_pydatetime': - return 'datetime' + if module_name == 'datetime': + return 'datetime.' else: - return module_name + return '' MINYEAR = 1 MAXYEAR = 9999 @@ -64,14 +62,14 @@ def _days_in_month(year, month): def _days_before_month(year, month): "year, month -> number of days in year preceding first day of month." 
- assert 1 <= month <= 12, 'month must be in 1..12' + assert 1 <= month <= 12, f"month must be in 1..12, not {month}" return _DAYS_BEFORE_MONTH[month] + (month > 2 and _is_leap(year)) def _ymd2ord(year, month, day): "year, month, day -> ordinal, considering 01-Jan-0001 as day 1." - assert 1 <= month <= 12, 'month must be in 1..12' + assert 1 <= month <= 12, f"month must be in 1..12, not {month}" dim = _days_in_month(year, month) - assert 1 <= day <= dim, ('day must be in 1..%d' % dim) + assert 1 <= day <= dim, f"day must be in 1..{dim}, not {day}" return (_days_before_year(year) + _days_before_month(year, month) + day) @@ -204,6 +202,17 @@ def _format_offset(off, sep=':'): s += '.%06d' % ss.microseconds return s +_normalize_century = None +def _need_normalize_century(): + global _normalize_century + if _normalize_century is None: + try: + _normalize_century = ( + _time.strftime("%Y", (99, 1, 1, 0, 0, 0, 0, 1, 0)) != "0099") + except ValueError: + _normalize_century = True + return _normalize_century + # Correctly substitute for %z and %Z escapes in strftime formats. def _wrap_strftime(object, format, timetuple): # Don't call utcoffset() or tzname() unless actually needed. @@ -261,6 +270,20 @@ def _wrap_strftime(object, format, timetuple): # strftime is going to have at this: escape % Zreplace = s.replace('%', '%%') newformat.append(Zreplace) + # Note that datetime(1000, 1, 1).strftime('%G') == '1000' so + # year 1000 for %G can go on the fast path. 
+ elif ((ch in 'YG' or ch in 'FC') and + object.year < 1000 and _need_normalize_century()): + if ch == 'G': + year = int(_time.strftime("%G", timetuple)) + else: + year = object.year + if ch == 'C': + push('{:02}'.format(year // 100)) + else: + push('{:04}'.format(year)) + if ch == 'F': + push('-{:02}-{:02}'.format(*timetuple[1:3])) else: push('%') push(ch) @@ -399,9 +422,11 @@ def _parse_hh_mm_ss_ff(tstr): if pos < len_str: if tstr[pos] not in '.,': - raise ValueError("Invalid microsecond component") + raise ValueError("Invalid microsecond separator") else: pos += 1 + if not all(map(_is_ascii_digit, tstr[pos:])): + raise ValueError("Non-digit values in fraction") len_remainder = len_str - pos @@ -413,9 +438,6 @@ def _parse_hh_mm_ss_ff(tstr): time_comps[3] = int(tstr[pos:(pos+to_parse)]) if to_parse < 6: time_comps[3] *= _FRACTION_CORRECTION[to_parse-1] - if (len_remainder > to_parse - and not all(map(_is_ascii_digit, tstr[(pos+to_parse):]))): - raise ValueError("Non-digit values in unparsed fraction") return time_comps @@ -431,6 +453,17 @@ def _parse_isoformat_time(tstr): time_comps = _parse_hh_mm_ss_ff(timestr) + hour, minute, second, microsecond = time_comps + became_next_day = False + error_from_components = False + if (hour == 24): + if all(time_comp == 0 for time_comp in time_comps[1:]): + hour = 0 + time_comps[0] = hour + became_next_day = True + else: + error_from_components = True + tzi = None if tz_pos == len_str and tstr[-1] == 'Z': tzi = timezone.utc @@ -446,7 +479,7 @@ def _parse_isoformat_time(tstr): # HH:MM:SS len: 8 # HH:MM:SS.f+ len: 10+ - if len(tzstr) in (0, 1, 3): + if len(tzstr) in (0, 1, 3) or tstr[tz_pos-1] == 'Z': raise ValueError("Malformed time zone string") tz_comps = _parse_hh_mm_ss_ff(tzstr) @@ -463,13 +496,13 @@ def _parse_isoformat_time(tstr): time_comps.append(tzi) - return time_comps + return time_comps, became_next_day, error_from_components # tuple[int, int, int] -> tuple[int, int, int] version of date.fromisocalendar def 
_isoweek_to_gregorian(year, week, day): # Year is bounded this way because 9999-12-31 is (9999, 52, 5) if not MINYEAR <= year <= MAXYEAR: - raise ValueError(f"Year is out of range: {year}") + raise ValueError(f"year must be in {MINYEAR}..{MAXYEAR}, not {year}") if not 0 < week < 53: out_of_range = True @@ -502,7 +535,7 @@ def _isoweek_to_gregorian(year, week, day): def _check_tzname(name): if name is not None and not isinstance(name, str): raise TypeError("tzinfo.tzname() must return None or string, " - "not '%s'" % type(name)) + f"not {type(name).__name__!r}") # name is the offset-producing method, "utcoffset" or "dst". # offset is what it returned. @@ -515,24 +548,24 @@ def _check_utc_offset(name, offset): if offset is None: return if not isinstance(offset, timedelta): - raise TypeError("tzinfo.%s() must return None " - "or timedelta, not '%s'" % (name, type(offset))) + raise TypeError(f"tzinfo.{name}() must return None " + f"or timedelta, not {type(offset).__name__!r}") if not -timedelta(1) < offset < timedelta(1): - raise ValueError("%s()=%s, must be strictly between " - "-timedelta(hours=24) and timedelta(hours=24)" % - (name, offset)) + raise ValueError("offset must be a timedelta " + "strictly between -timedelta(hours=24) and " + f"timedelta(hours=24), not {offset!r}") def _check_date_fields(year, month, day): year = _index(year) month = _index(month) day = _index(day) if not MINYEAR <= year <= MAXYEAR: - raise ValueError('year must be in %d..%d' % (MINYEAR, MAXYEAR), year) + raise ValueError(f"year must be in {MINYEAR}..{MAXYEAR}, not {year}") if not 1 <= month <= 12: - raise ValueError('month must be in 1..12', month) + raise ValueError(f"month must be in 1..12, not {month}") dim = _days_in_month(year, month) if not 1 <= day <= dim: - raise ValueError('day must be in 1..%d' % dim, day) + raise ValueError(f"day {day} must be in range 1..{dim} for month {month} in year {year}") return year, month, day def _check_time_fields(hour, minute, second, microsecond, 
fold): @@ -541,20 +574,23 @@ def _check_time_fields(hour, minute, second, microsecond, fold): second = _index(second) microsecond = _index(microsecond) if not 0 <= hour <= 23: - raise ValueError('hour must be in 0..23', hour) + raise ValueError(f"hour must be in 0..23, not {hour}") if not 0 <= minute <= 59: - raise ValueError('minute must be in 0..59', minute) + raise ValueError(f"minute must be in 0..59, not {minute}") if not 0 <= second <= 59: - raise ValueError('second must be in 0..59', second) + raise ValueError(f"second must be in 0..59, not {second}") if not 0 <= microsecond <= 999999: - raise ValueError('microsecond must be in 0..999999', microsecond) + raise ValueError(f"microsecond must be in 0..999999, not {microsecond}") if fold not in (0, 1): - raise ValueError('fold must be either 0 or 1', fold) + raise ValueError(f"fold must be either 0 or 1, not {fold}") return hour, minute, second, microsecond, fold def _check_tzinfo_arg(tz): if tz is not None and not isinstance(tz, tzinfo): - raise TypeError("tzinfo argument must be None or of a tzinfo subclass") + raise TypeError( + "tzinfo argument must be None or of a tzinfo subclass, " + f"not {type(tz).__name__!r}" + ) def _divide_and_round(a, b): """divide a by b and round result to the nearest integer @@ -608,7 +644,19 @@ def __new__(cls, days=0, seconds=0, microseconds=0, # guide the C implementation; it's way more convoluted than speed- # ignoring auto-overflow-to-long idiomatic Python could be. - # XXX Check that all inputs are ints or floats. + for name, value in ( + ("days", days), + ("seconds", seconds), + ("microseconds", microseconds), + ("milliseconds", milliseconds), + ("minutes", minutes), + ("hours", hours), + ("weeks", weeks) + ): + if not isinstance(value, (int, float)): + raise TypeError( + f"unsupported type for timedelta {name} component: {type(value).__name__}" + ) # Final values, all integer. # s and us fit in 32-bit signed ints; d isn't bounded. 
@@ -709,9 +757,9 @@ def __repr__(self): args.append("microseconds=%d" % self._microseconds) if not args: args.append('0') - return "%s.%s(%s)" % (_get_class_module(self), - self.__class__.__qualname__, - ', '.join(args)) + return "%s%s(%s)" % (_get_class_module(self), + self.__class__.__qualname__, + ', '.join(args)) def __str__(self): mm, ss = divmod(self._seconds, 60) @@ -908,6 +956,7 @@ class date: fromtimestamp() today() fromordinal() + strptime() Operators: @@ -990,8 +1039,12 @@ def fromordinal(cls, n): @classmethod def fromisoformat(cls, date_string): """Construct a date from a string in ISO 8601 format.""" + if not isinstance(date_string, str): - raise TypeError('fromisoformat: argument must be str') + raise TypeError('Argument must be a str') + + if not date_string.isascii(): + raise ValueError('Argument must be an ASCII str') if len(date_string) not in (7, 8, 10): raise ValueError(f'Invalid isoformat string: {date_string!r}') @@ -1008,6 +1061,12 @@ def fromisocalendar(cls, year, week, day): This is the inverse of the date.isocalendar() function""" return cls(*_isoweek_to_gregorian(year, week, day)) + @classmethod + def strptime(cls, date_string, format): + """Parse a date string according to the given format (like time.strptime()).""" + import _strptime + return _strptime._strptime_datetime_date(cls, date_string, format) + # Conversions to string def __repr__(self): @@ -1017,11 +1076,11 @@ def __repr__(self): >>> repr(d) 'datetime.date(2010, 1, 1)' """ - return "%s.%s(%d, %d, %d)" % (_get_class_module(self), - self.__class__.__qualname__, - self._year, - self._month, - self._day) + return "%s%s(%d, %d, %d)" % (_get_class_module(self), + self.__class__.__qualname__, + self._year, + self._month, + self._day) # XXX These shouldn't depend on time.localtime(), because that # clips the usable dates to [1970 .. 2038). 
At least ctime() is # easily done without using strftime() -- that's better too because @@ -1057,8 +1116,8 @@ def isoformat(self): This is 'YYYY-MM-DD'. References: - - http://www.w3.org/TR/NOTE-datetime - - http://www.cl.cam.ac.uk/~mgk25/iso-time.html + - https://www.w3.org/TR/NOTE-datetime + - https://www.cl.cam.ac.uk/~mgk25/iso-time.html """ return "%04d-%02d-%02d" % (self._year, self._month, self._day) @@ -1192,7 +1251,7 @@ def isocalendar(self): The first week is 1; Monday is 1 ... Sunday is 7. ISO calendar algorithm taken from - http://www.phys.uu.nl/~vgent/calendar/isocalendar.htm + https://www.phys.uu.nl/~vgent/calendar/isocalendar.htm (used with permission) """ year = self._year @@ -1328,6 +1387,7 @@ class time: Constructors: __new__() + strptime() Operators: @@ -1386,6 +1446,12 @@ def __new__(cls, hour=0, minute=0, second=0, microsecond=0, tzinfo=None, *, fold self._fold = fold return self + @classmethod + def strptime(cls, date_string, format): + """string, format -> new time parsed from a string (like time.strptime()).""" + import _strptime + return _strptime._strptime_datetime_time(cls, date_string, format) + # Read-only field accessors @property def hour(self): @@ -1514,7 +1580,7 @@ def __repr__(self): s = ", %d" % self._second else: s = "" - s= "%s.%s(%d, %d%s)" % (_get_class_module(self), + s = "%s%s(%d, %d%s)" % (_get_class_module(self), self.__class__.__qualname__, self._hour, self._minute, s) if self._tzinfo is not None: @@ -1556,7 +1622,7 @@ def fromisoformat(cls, time_string): time_string = time_string.removeprefix('T') try: - return cls(*_parse_isoformat_time(time_string)) + return cls(*_parse_isoformat_time(time_string)[0]) except Exception: raise ValueError(f'Invalid isoformat string: {time_string!r}') @@ -1870,10 +1936,27 @@ def fromisoformat(cls, date_string): if tstr: try: - time_components = _parse_isoformat_time(tstr) + time_components, became_next_day, error_from_components = _parse_isoformat_time(tstr) except ValueError: raise 
ValueError( f'Invalid isoformat string: {date_string!r}') from None + else: + if error_from_components: + raise ValueError("minute, second, and microsecond must be 0 when hour is 24") + + if became_next_day: + year, month, day = date_components + # Only wrap day/month when it was previously valid + if month <= 12 and day <= (days_in_month := _days_in_month(year, month)): + # Calculate midnight of the next day + day += 1 + if day > days_in_month: + day = 1 + month += 1 + if month > 12: + month = 1 + year += 1 + date_components = [year, month, day] else: time_components = [0, 0, 0, 0, None] @@ -2045,7 +2128,7 @@ def isoformat(self, sep='T', timespec='auto'): By default, the fractional part is omitted if self.microsecond == 0. If self.tzinfo is not None, the UTC offset is also attached, giving - giving a full format of 'YYYY-MM-DD HH:MM:SS.mmmmmm+HH:MM'. + a full format of 'YYYY-MM-DD HH:MM:SS.mmmmmm+HH:MM'. Optional argument sep specifies the separator between date and time, default 'T'. @@ -2073,9 +2156,9 @@ def __repr__(self): del L[-1] if L[-1] == 0: del L[-1] - s = "%s.%s(%s)" % (_get_class_module(self), - self.__class__.__qualname__, - ", ".join(map(str, L))) + s = "%s%s(%s)" % (_get_class_module(self), + self.__class__.__qualname__, + ", ".join(map(str, L))) if self._tzinfo is not None: assert s[-1:] == ")" s = s[:-1] + ", tzinfo=%r" % self._tzinfo + ")" @@ -2092,7 +2175,7 @@ def __str__(self): def strptime(cls, date_string, format): 'string, format -> new datetime parsed from a string (like time.strptime()).' 
import _strptime - return _strptime._strptime_datetime(cls, date_string, format) + return _strptime._strptime_datetime_datetime(cls, date_string, format) def utcoffset(self): """Return the timezone offset as timedelta positive east of UTC (negative west of @@ -2306,7 +2389,6 @@ def __reduce__(self): def _isoweek1monday(year): # Helper to calculate the day number of the Monday starting week 1 - # XXX This could be done more efficiently THURSDAY = 3 firstday = _ymd2ord(year, 1, 1) firstweekday = (firstday + 6) % 7 # See weekday() above @@ -2333,7 +2415,7 @@ def __new__(cls, offset, name=_Omitted): if not cls._minoffset <= offset <= cls._maxoffset: raise ValueError("offset must be a timedelta " "strictly between -timedelta(hours=24) and " - "timedelta(hours=24).") + f"timedelta(hours=24), not {offset!r}") return cls._create(offset, name) def __init_subclass__(cls): @@ -2373,12 +2455,12 @@ def __repr__(self): if self is self.utc: return 'datetime.timezone.utc' if self._name is None: - return "%s.%s(%r)" % (_get_class_module(self), - self.__class__.__qualname__, - self._offset) - return "%s.%s(%r, %r)" % (_get_class_module(self), - self.__class__.__qualname__, - self._offset, self._name) + return "%s%s(%r)" % (_get_class_module(self), + self.__class__.__qualname__, + self._offset) + return "%s%s(%r, %r)" % (_get_class_module(self), + self.__class__.__qualname__, + self._offset, self._name) def __str__(self): return self.tzname(None) diff --git a/python/python3_13/examples/_pydecimal.py b/python/python3_14/examples/_pydecimal.py similarity index 98% rename from python/python3_13/examples/_pydecimal.py rename to python/python3_14/examples/_pydecimal.py index 75df3db262..97a629fe92 100644 --- a/python/python3_13/examples/_pydecimal.py +++ b/python/python3_14/examples/_pydecimal.py @@ -38,10 +38,10 @@ 'ROUND_FLOOR', 'ROUND_UP', 'ROUND_HALF_DOWN', 'ROUND_05UP', # Functions for manipulating contexts - 'setcontext', 'getcontext', 'localcontext', + 'setcontext', 'getcontext', 
'localcontext', 'IEEEContext', # Limits for the C version for compatibility - 'MAX_PREC', 'MAX_EMAX', 'MIN_EMIN', 'MIN_ETINY', + 'MAX_PREC', 'MAX_EMAX', 'MIN_EMIN', 'MIN_ETINY', 'IEEE_CONTEXT_MAX_BITS', # C version: compile time choice that enables the thread local context (deprecated, now always true) 'HAVE_THREADS', @@ -83,10 +83,12 @@ MAX_PREC = 999999999999999999 MAX_EMAX = 999999999999999999 MIN_EMIN = -999999999999999999 + IEEE_CONTEXT_MAX_BITS = 512 else: MAX_PREC = 425000000 MAX_EMAX = 425000000 MIN_EMIN = -425000000 + IEEE_CONTEXT_MAX_BITS = 256 MIN_ETINY = MIN_EMIN - (MAX_PREC-1) @@ -97,7 +99,7 @@ class DecimalException(ArithmeticError): Used exceptions derive from this. If an exception derives from another exception besides this (such as - Underflow (Inexact, Rounded, Subnormal) that indicates that it is only + Underflow (Inexact, Rounded, Subnormal)) that indicates that it is only called if the others are present. This isn't actually used for anything, though. @@ -145,7 +147,7 @@ class InvalidOperation(DecimalException): x ** (+-)INF An operand is invalid - The result of the operation after these is a quiet positive NaN, + The result of the operation after this is a quiet positive NaN, except when the cause is a signaling NaN, in which case the result is also a quiet NaN, but with the original sign, and an optional diagnostic information. @@ -417,6 +419,27 @@ def sin(x): return ctx_manager +def IEEEContext(bits, /): + """ + Return a context object initialized to the proper values for one of the + IEEE interchange formats. The argument must be a multiple of 32 and less + than IEEE_CONTEXT_MAX_BITS. 
+ """ + if bits <= 0 or bits > IEEE_CONTEXT_MAX_BITS or bits % 32: + raise ValueError("argument must be a multiple of 32, " + f"with a maximum of {IEEE_CONTEXT_MAX_BITS}") + + ctx = Context() + ctx.prec = 9 * (bits//32) - 2 + ctx.Emax = 3 * (1 << (bits//16 + 3)) + ctx.Emin = 1 - ctx.Emax + ctx.rounding = ROUND_HALF_EVEN + ctx.clamp = 1 + ctx.traps = dict.fromkeys(_signals, False) + + return ctx + + ##### Decimal class ####################################################### # Do not subclass Decimal from numbers.Real and do not register it as such @@ -582,6 +605,21 @@ def __new__(cls, value="0", context=None): raise TypeError("Cannot convert %r to Decimal" % value) + @classmethod + def from_number(cls, number): + """Converts a real number to a decimal number, exactly. + + >>> Decimal.from_number(314) # int + Decimal('314') + >>> Decimal.from_number(0.1) # float + Decimal('0.1000000000000000055511151231257827021181583404541015625') + >>> Decimal.from_number(Decimal('3.14')) # another decimal instance + Decimal('3.14') + """ + if isinstance(number, (int, Decimal, float)): + return cls(number) + raise TypeError("Cannot convert %r to Decimal" % number) + @classmethod def from_float(cls, f): """Converts a float to a decimal number, exactly. 
@@ -2425,12 +2463,12 @@ def __pow__(self, other, modulo=None, context=None): return ans - def __rpow__(self, other, context=None): + def __rpow__(self, other, modulo=None, context=None): """Swaps self/other and returns __pow__.""" other = _convert_other(other) if other is NotImplemented: return other - return other.__pow__(self, context=context) + return other.__pow__(self, modulo, context=context) def normalize(self, context=None): """Normalize- strip trailing 0s, change anything equal to 0 to 0e0""" @@ -3302,7 +3340,10 @@ def _fill_logical(self, context, opa, opb): return opa, opb def logical_and(self, other, context=None): - """Applies an 'and' operation between self and other's digits.""" + """Applies an 'and' operation between self and other's digits. + + Both self and other must be logical numbers. + """ if context is None: context = getcontext() @@ -3319,14 +3360,20 @@ def logical_and(self, other, context=None): return _dec_from_triple(0, result.lstrip('0') or '0', 0) def logical_invert(self, context=None): - """Invert all its digits.""" + """Invert all its digits. + + The self must be logical number. + """ if context is None: context = getcontext() return self.logical_xor(_dec_from_triple(0,'1'*context.prec,0), context) def logical_or(self, other, context=None): - """Applies an 'or' operation between self and other's digits.""" + """Applies an 'or' operation between self and other's digits. + + Both self and other must be logical numbers. + """ if context is None: context = getcontext() @@ -3343,7 +3390,10 @@ def logical_or(self, other, context=None): return _dec_from_triple(0, result.lstrip('0') or '0', 0) def logical_xor(self, other, context=None): - """Applies an 'xor' operation between self and other's digits.""" + """Applies an 'xor' operation between self and other's digits. + + Both self and other must be logical numbers. 
+ """ if context is None: context = getcontext() @@ -6058,7 +6108,7 @@ def _convert_for_comparison(self, other, equality_op=False): (?P\d*) # with (possibly empty) diagnostic info. ) # \s* - \Z + \z """, re.VERBOSE | re.IGNORECASE).match _all_zeros = re.compile('0*$').match @@ -6082,11 +6132,15 @@ def _convert_for_comparison(self, other, equality_op=False): (?Pz)? (?P\#)? (?P0)? -(?P(?!0)\d+)? -(?P,)? -(?:\.(?P0|(?!0)\d+))? +(?P\d+)? +(?P[,_])? +(?:\. + (?=[\d,_]) # lookahead for digit or separator + (?P\d+)? + (?P[,_])? +)? (?P[eEfFgGn%])? -\Z +\z """, re.VERBOSE|re.DOTALL) del re @@ -6177,6 +6231,9 @@ def _parse_format_specifier(format_spec, _localeconv=None): format_dict['grouping'] = [3, 0] format_dict['decimal_point'] = '.' + if format_dict['frac_separators'] is None: + format_dict['frac_separators'] = '' + return format_dict def _format_align(sign, body, spec): @@ -6296,6 +6353,11 @@ def _format_number(is_negative, intpart, fracpart, exp, spec): sign = _format_sign(is_negative, spec) + frac_sep = spec['frac_separators'] + if fracpart and frac_sep: + fracpart = frac_sep.join(fracpart[pos:pos + 3] + for pos in range(0, len(fracpart), 3)) + if fracpart or spec['alt']: fracpart = spec['decimal_point'] + fracpart diff --git a/python/python3_13/examples/_pyio.py b/python/python3_14/examples/_pyio.py similarity index 94% rename from python/python3_13/examples/_pyio.py rename to python/python3_14/examples/_pyio.py index a3fede6992..612e4a175e 100644 --- a/python/python3_13/examples/_pyio.py +++ b/python/python3_14/examples/_pyio.py @@ -16,15 +16,16 @@ _setmode = None import io -from io import (__all__, SEEK_SET, SEEK_CUR, SEEK_END) +from io import (__all__, SEEK_SET, SEEK_CUR, SEEK_END, Reader, Writer) # noqa: F401 valid_seek_flags = {0, 1, 2} # Hardwired values if hasattr(os, 'SEEK_HOLE') : valid_seek_flags.add(os.SEEK_HOLE) valid_seek_flags.add(os.SEEK_DATA) -# open() uses st_blksize whenever we can -DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes +# open() uses 
max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE) +# when the device block size is available. +DEFAULT_BUFFER_SIZE = 128 * 1024 # bytes # NOTE: Base classes defined here are registered with the "official" ABCs # defined in io.py. We don't use real inheritance though, because we don't want @@ -123,10 +124,10 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None, the size of a fixed-size chunk buffer. When no buffering argument is given, the default buffering policy works as follows: - * Binary files are buffered in fixed-size chunks; the size of the buffer - is chosen using a heuristic trying to determine the underlying device's - "block size" and falling back on `io.DEFAULT_BUFFER_SIZE`. - On many systems, the buffer will typically be 4096 or 8192 bytes long. + * Binary files are buffered in fixed-size chunks; the size of the buffer + is max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE) + when the device block size is available. + On most systems, the buffer will typically be 128 kilobytes long. * "Interactive" text files (files for which isatty() returns True) use line buffering. Other text files use the policy described above @@ -238,18 +239,11 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None, result = raw try: line_buffering = False - if buffering == 1 or buffering < 0 and raw.isatty(): + if buffering == 1 or buffering < 0 and raw._isatty_open_only(): buffering = -1 line_buffering = True if buffering < 0: - buffering = DEFAULT_BUFFER_SIZE - try: - bs = os.fstat(raw.fileno()).st_blksize - except (OSError, AttributeError): - pass - else: - if bs > 1: - buffering = bs + buffering = max(min(raw._blksize, 8192 * 1024), DEFAULT_BUFFER_SIZE) if buffering < 0: raise ValueError("invalid buffering size") if buffering == 0: @@ -413,6 +407,9 @@ def __del__(self): if closed: return + if dealloc_warn := getattr(self, "_dealloc_warn", None): + dealloc_warn(self) + # If close() fails, the caller logs the exception with # sys.unraisablehook. 
close() must be called at the end at __del__(). self.close() @@ -620,6 +617,8 @@ def read(self, size=-1): n = self.readinto(b) if n is None: return None + if n < 0 or n > len(b): + raise ValueError(f"readinto returned {n} outside buffer size {len(b)}") del b[n:] return bytes(b) @@ -651,8 +650,6 @@ def write(self, b): self._unsupported("write") io.RawIOBase.register(RawIOBase) -from _io import FileIO -RawIOBase.register(FileIO) class BufferedIOBase(IOBase): @@ -859,6 +856,10 @@ def __repr__(self): else: return "<{}.{} name={!r}>".format(modname, clsname, name) + def _dealloc_warn(self, source): + if dealloc_warn := getattr(self.raw, "_dealloc_warn", None): + dealloc_warn(source) + ### Lower-level APIs ### def fileno(self): @@ -944,10 +945,8 @@ def write(self, b): return 0 pos = self._pos if pos > len(self._buffer): - # Inserts null bytes between the current end of the file - # and the new write position. - padding = b'\x00' * (pos - len(self._buffer)) - self._buffer += padding + # Pad buffer to pos with null bytes. + self._buffer.resize(pos) self._buffer[pos:pos + n] = b self._pos += n return n @@ -1463,6 +1462,17 @@ def write(self, b): return BufferedWriter.write(self, b) +def _new_buffersize(bytes_read): + # Parallels _io/fileio.c new_buffersize + if bytes_read > 65536: + addend = bytes_read >> 3 + else: + addend = 256 + bytes_read + if addend < DEFAULT_BUFFER_SIZE: + addend = DEFAULT_BUFFER_SIZE + return bytes_read + addend + + class FileIO(RawIOBase): _fd = -1 _created = False @@ -1487,6 +1497,7 @@ def __init__(self, file, mode='r', closefd=True, opener=None): """ if self._fd >= 0: # Have to close the existing file first. + self._stat_atopen = None try: if self._closefd: os.close(self._fd) @@ -1559,24 +1570,22 @@ def __init__(self, file, mode='r', closefd=True, opener=None): if not isinstance(fd, int): raise TypeError('expected integer from opener') if fd < 0: - raise OSError('Negative file descriptor') + # bpo-27066: Raise a ValueError for bad value. 
+ raise ValueError(f'opener returned {fd}') owned_fd = fd if not noinherit_flag: os.set_inheritable(fd, False) self._closefd = closefd - fdfstat = os.fstat(fd) + self._stat_atopen = os.fstat(fd) try: - if stat.S_ISDIR(fdfstat.st_mode): + if stat.S_ISDIR(self._stat_atopen.st_mode): raise IsADirectoryError(errno.EISDIR, os.strerror(errno.EISDIR), file) except AttributeError: # Ignore the AttributeError if stat.S_ISDIR or errno.EISDIR # don't exist. pass - self._blksize = getattr(fdfstat, 'st_blksize', 0) - if self._blksize <= 1: - self._blksize = DEFAULT_BUFFER_SIZE if _setmode: # don't translate newlines (\r\n <=> \n) @@ -1593,17 +1602,17 @@ def __init__(self, file, mode='r', closefd=True, opener=None): if e.errno != errno.ESPIPE: raise except: + self._stat_atopen = None if owned_fd is not None: os.close(owned_fd) raise self._fd = fd - def __del__(self): + def _dealloc_warn(self, source): if self._fd >= 0 and self._closefd and not self.closed: import warnings - warnings.warn('unclosed file %r' % (self,), ResourceWarning, + warnings.warn(f'unclosed file {source!r}', ResourceWarning, stacklevel=2, source=self) - self.close() def __getstate__(self): raise TypeError(f"cannot pickle {self.__class__.__name__!r} object") @@ -1622,6 +1631,17 @@ def __repr__(self): return ('<%s name=%r mode=%r closefd=%r>' % (class_name, name, self.mode, self._closefd)) + @property + def _blksize(self): + if self._stat_atopen is None: + return DEFAULT_BUFFER_SIZE + + blksize = getattr(self._stat_atopen, "st_blksize", 0) + # WASI sets blsize to 0 + if not blksize: + return DEFAULT_BUFFER_SIZE + return blksize + def _checkReadable(self): if not self._readable: raise UnsupportedOperation('File not open for reading') @@ -1633,7 +1653,13 @@ def _checkWritable(self, msg=None): def read(self, size=None): """Read at most size bytes, returned as bytes. 
- Only makes one system call, so less data may be returned than requested + If size is less than 0, read all bytes in the file making + multiple read calls. See ``FileIO.readall``. + + Attempts to make only one system call, retrying only per + PEP 475 (EINTR). This means less data may be returned than + requested. + In non-blocking mode, returns None if no data is available. Return an empty bytes object at EOF. """ @@ -1649,45 +1675,57 @@ def read(self, size=None): def readall(self): """Read all data from the file, returned as bytes. - In non-blocking mode, returns as much as is immediately available, - or None if no data is available. Return an empty bytes object at EOF. + Reads until either there is an error or read() returns size 0 + (indicates EOF). If the file is already at EOF, returns an + empty bytes object. + + In non-blocking mode, returns as much data as could be read + before EAGAIN. If no data is available (EAGAIN is returned + before bytes are read) returns None. """ self._checkClosed() self._checkReadable() - bufsize = DEFAULT_BUFFER_SIZE - try: - pos = os.lseek(self._fd, 0, SEEK_CUR) - end = os.fstat(self._fd).st_size - if end >= pos: - bufsize = end - pos + 1 - except OSError: - pass + if self._stat_atopen is None or self._stat_atopen.st_size <= 0: + bufsize = DEFAULT_BUFFER_SIZE + else: + # In order to detect end of file, need a read() of at least 1 + # byte which returns size 0. Oversize the buffer by 1 byte so the + # I/O can be completed with two read() calls (one for all data, one + # for EOF) without needing to resize the buffer. 
+ bufsize = self._stat_atopen.st_size + 1 - result = bytearray() - while True: - if len(result) >= bufsize: - bufsize = len(result) - bufsize += max(bufsize, DEFAULT_BUFFER_SIZE) - n = bufsize - len(result) - try: - chunk = os.read(self._fd, n) - except BlockingIOError: - if result: - break + if self._stat_atopen.st_size > 65536: + try: + pos = os.lseek(self._fd, 0, SEEK_CUR) + if self._stat_atopen.st_size >= pos: + bufsize = self._stat_atopen.st_size - pos + 1 + except OSError: + pass + + result = bytearray(bufsize) + bytes_read = 0 + try: + while n := os.readinto(self._fd, memoryview(result)[bytes_read:]): + bytes_read += n + if bytes_read >= len(result): + result.resize(_new_buffersize(bytes_read)) + except BlockingIOError: + if not bytes_read: return None - if not chunk: # reached the end of the file - break - result += chunk + assert len(result) - bytes_read >= 1, \ + "os.readinto buffer size 0 will result in erroneous EOF / returns 0" + result.resize(bytes_read) return bytes(result) - def readinto(self, b): + def readinto(self, buffer): """Same as RawIOBase.readinto().""" - m = memoryview(b).cast('B') - data = self.read(len(m)) - n = len(data) - m[:n] = data - return n + self._checkClosed() + self._checkReadable() + try: + return os.readinto(self._fd, buffer) + except BlockingIOError: + return None def write(self, b): """Write bytes b to file, return number written. @@ -1737,6 +1775,7 @@ def truncate(self, size=None): if size is None: size = self.tell() os.ftruncate(self._fd, size) + self._stat_atopen = None return size def close(self): @@ -1746,8 +1785,9 @@ def close(self): called more than once without error. """ if not self.closed: + self._stat_atopen = None try: - if self._closefd: + if self._closefd and self._fd >= 0: os.close(self._fd) finally: super().close() @@ -1784,6 +1824,21 @@ def isatty(self): self._checkClosed() return os.isatty(self._fd) + def _isatty_open_only(self): + """Checks whether the file is a TTY using an open-only optimization. 
+ + TTYs are always character devices. If the interpreter knows a file is + not a character device when it would call ``isatty``, can skip that + call. Inside ``open()`` there is a fresh stat result that contains that + information. Use the stat result to skip a system call. Outside of that + context TOCTOU issues (the fd could be arbitrarily modified by + surrounding code). + """ + if (self._stat_atopen is not None + and not stat.S_ISCHR(self._stat_atopen.st_mode)): + return False + return os.isatty(self._fd) + @property def closefd(self): """True if the file descriptor will be closed by close().""" @@ -2008,8 +2063,7 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None, raise ValueError("invalid encoding: %r" % encoding) if not codecs.lookup(encoding)._is_text_encoding: - msg = ("%r is not a text encoding; " - "use codecs.open() to handle arbitrary codecs") + msg = "%r is not a text encoding" raise LookupError(msg % encoding) if errors is None: @@ -2517,9 +2571,12 @@ def read(self, size=None): size = size_index() decoder = self._decoder or self._get_decoder() if size < 0: + chunk = self.buffer.read() + if chunk is None: + raise BlockingIOError("Read returned None.") # Read everything. result = (self._get_decoded_chars() + - decoder.decode(self.buffer.read(), final=True)) + decoder.decode(chunk, final=True)) if self._snapshot is not None: self._set_decoded_chars('') self._snapshot = None @@ -2639,6 +2696,10 @@ def readline(self, size=None): def newlines(self): return self._decoder.newlines if self._decoder else None + def _dealloc_warn(self, source): + if dealloc_warn := getattr(self.buffer, "_dealloc_warn", None): + dealloc_warn(source) + class StringIO(TextIOWrapper): """Text I/O implementation using an in-memory buffer. 
diff --git a/python/python3_14/examples/_pylong.py b/python/python3_14/examples/_pylong.py new file mode 100644 index 0000000000..be1acd17ce --- /dev/null +++ b/python/python3_14/examples/_pylong.py @@ -0,0 +1,729 @@ +"""Python implementations of some algorithms for use by longobject.c. +The goal is to provide asymptotically faster algorithms that can be +used for operations on integers with many digits. In those cases, the +performance overhead of the Python implementation is not significant +since the asymptotic behavior is what dominates runtime. Functions +provided by this module should be considered private and not part of any +public API. + +Note: for ease of maintainability, please prefer clear code and avoid +"micro-optimizations". This module will only be imported and used for +integers with a huge number of digits. Saving a few microseconds with +tricky or non-obvious code is not worth it. For people looking for +maximum performance, they should use something like gmpy2.""" + +import re +import decimal +try: + import _decimal +except ImportError: + _decimal = None + +# A number of functions have this form, where `w` is a desired number of +# digits in base `base`: +# +# def inner(...w...): +# if w <= LIMIT: +# return something +# lo = w >> 1 +# hi = w - lo +# something involving base**lo, inner(...lo...), j, and inner(...hi...) +# figure out largest w needed +# result = inner(w) +# +# They all had some on-the-fly scheme to cache `base**lo` results for reuse. +# Power is costly. +# +# This routine aims to compute all amd only the needed powers in advance, as +# efficiently as reasonably possible. This isn't trivial, and all the +# on-the-fly methods did needless work in many cases. The driving code above +# changes to: +# +# figure out largest w needed +# mycache = compute_powers(w, base, LIMIT) +# result = inner(w) +# +# and `mycache[lo]` replaces `base**lo` in the inner function. 
+# +# If an algorithm wants the powers of ceiling(w/2) instead of the floor, +# pass keyword argument `need_hi=True`. +# +# While this does give minor speedups (a few percent at best), the +# primary intent is to simplify the functions using this, by eliminating +# the need for them to craft their own ad-hoc caching schemes. +# +# See code near end of file for a block of code that can be enabled to +# run millions of tests. +def compute_powers(w, base, more_than, *, need_hi=False, show=False): + seen = set() + need = set() + ws = {w} + while ws: + w = ws.pop() # any element is fine to use next + if w in seen or w <= more_than: + continue + seen.add(w) + lo = w >> 1 + hi = w - lo + # only _need_ one here; the other may, or may not, be needed + which = hi if need_hi else lo + need.add(which) + ws.add(which) + if lo != hi: + ws.add(w - which) + + # `need` is the set of exponents needed. To compute them all + # efficiently, possibly add other exponents to `extra`. The goal is + # to ensure that each exponent can be gotten from a smaller one via + # multiplying by the base, squaring it, or squaring and then + # multiplying by the base. + # + # If need_hi is False, this is already the case (w can always be + # gotten from w >> 1 via one of the squaring strategies). But we do + # the work anyway, just in case ;-) + # + # Note that speed is irrelevant. These loops are working on little + # ints (exponents) and go around O(log w) times. The total cost is + # insignificant compared to just one of the bigint multiplies. + cands = need.copy() + extra = set() + while cands: + w = max(cands) + cands.remove(w) + lo = w >> 1 + if lo > more_than and w-1 not in cands and lo not in cands: + extra.add(lo) + cands.add(lo) + assert need_hi or not extra + + d = {} + for n in sorted(need | extra): + lo = n >> 1 + hi = n - lo + if n-1 in d: + if show: + print("* base", end="") + result = d[n-1] * base # cheap! 
+ elif lo in d: + # Multiplying a bigint by itself is about twice as fast + # in CPython provided it's the same object. + if show: + print("square", end="") + result = d[lo] * d[lo] # same object + if hi != lo: + if show: + print(" * base", end="") + assert 2 * lo + 1 == n + result *= base + else: # rare + if show: + print("pow", end='') + result = base ** n + if show: + print(" at", n, "needed" if n in need else "extra") + d[n] = result + + assert need <= d.keys() + if excess := d.keys() - need: + assert need_hi + for n in excess: + del d[n] + return d + +_unbounded_dec_context = decimal.getcontext().copy() +_unbounded_dec_context.prec = decimal.MAX_PREC +_unbounded_dec_context.Emax = decimal.MAX_EMAX +_unbounded_dec_context.Emin = decimal.MIN_EMIN +_unbounded_dec_context.traps[decimal.Inexact] = 1 # sanity check + +def int_to_decimal(n): + """Asymptotically fast conversion of an 'int' to Decimal.""" + + # Function due to Tim Peters. See GH issue #90716 for details. + # https://github.com/python/cpython/issues/90716 + # + # The implementation in longobject.c of base conversion algorithms + # between power-of-2 and non-power-of-2 bases are quadratic time. + # This function implements a divide-and-conquer algorithm that is + # faster for large numbers. Builds an equal decimal.Decimal in a + # "clever" recursive way. If we want a string representation, we + # apply str to _that_. + + from decimal import Decimal as D + BITLIM = 200 + + # Don't bother caching the "lo" mask in this; the time to compute it is + # tiny compared to the multiply. 
+ def inner(n, w): + if w <= BITLIM: + return D(n) + w2 = w >> 1 + hi = n >> w2 + lo = n & ((1 << w2) - 1) + return inner(lo, w2) + inner(hi, w - w2) * w2pow[w2] + + with decimal.localcontext(_unbounded_dec_context): + nbits = n.bit_length() + w2pow = compute_powers(nbits, D(2), BITLIM) + if n < 0: + negate = True + n = -n + else: + negate = False + result = inner(n, nbits) + if negate: + result = -result + return result + +def int_to_decimal_string(n): + """Asymptotically fast conversion of an 'int' to a decimal string.""" + w = n.bit_length() + if w > 450_000 and _decimal is not None: + # It is only usable with the C decimal implementation. + # _pydecimal.py calls str() on very large integers, which in its + # turn calls int_to_decimal_string(), causing very deep recursion. + return str(int_to_decimal(n)) + + # Fallback algorithm for the case when the C decimal module isn't + # available. This algorithm is asymptotically worse than the algorithm + # using the decimal module, but better than the quadratic time + # implementation in longobject.c. + + DIGLIM = 1000 + def inner(n, w): + if w <= DIGLIM: + return str(n) + w2 = w >> 1 + hi, lo = divmod(n, pow10[w2]) + return inner(hi, w - w2) + inner(lo, w2).zfill(w2) + + # The estimation of the number of decimal digits. + # There is no harm in small error. If we guess too large, there may + # be leading 0's that need to be stripped. If we guess too small, we + # may need to call str() recursively for the remaining highest digits, + # which can still potentially be a large integer. This is manifested + # only if the number has way more than 10**15 digits, that exceeds + # the 52-bit physical address limit in both Intel64 and AMD64. 
+ w = int(w * 0.3010299956639812 + 1) # log10(2) + pow10 = compute_powers(w, 5, DIGLIM) + for k, v in pow10.items(): + pow10[k] = v << k # 5**k << k == 5**k * 2**k == 10**k + if n < 0: + n = -n + sign = '-' + else: + sign = '' + s = inner(n, w) + if s[0] == '0' and n: + # If our guess of w is too large, there may be leading 0's that + # need to be stripped. + s = s.lstrip('0') + return sign + s + +def _str_to_int_inner(s): + """Asymptotically fast conversion of a 'str' to an 'int'.""" + + # Function due to Bjorn Martinsson. See GH issue #90716 for details. + # https://github.com/python/cpython/issues/90716 + # + # The implementation in longobject.c of base conversion algorithms + # between power-of-2 and non-power-of-2 bases are quadratic time. + # This function implements a divide-and-conquer algorithm making use + # of Python's built in big int multiplication. Since Python uses the + # Karatsuba algorithm for multiplication, the time complexity + # of this function is O(len(s)**1.58). + + DIGLIM = 2048 + + def inner(a, b): + if b - a <= DIGLIM: + return int(s[a:b]) + mid = (a + b + 1) >> 1 + return (inner(mid, b) + + ((inner(a, mid) * w5pow[b - mid]) + << (b - mid))) + + w5pow = compute_powers(len(s), 5, DIGLIM) + return inner(0, len(s)) + + +# Asymptotically faster version, using the C decimal module. See +# comments at the end of the file. This uses decimal arithmetic to +# convert from base 10 to base 256. The latter is just a string of +# bytes, which CPython can convert very efficiently to a Python int. + +# log of 10 to base 256 with best-possible 53-bit precision. Obtained +# via: +# from mpmath import mp +# mp.prec = 1000 +# print(float(mp.log(10, 256)).hex()) +_LOG_10_BASE_256 = float.fromhex('0x1.a934f0979a371p-2') # about 0.415 + +# _spread is for internal testing. 
It maps a key to the number of times +# that condition obtained in _dec_str_to_int_inner: +# key 0 - quotient guess was right +# key 1 - quotient had to be boosted by 1, one time +# key 999 - one adjustment wasn't enough, so fell back to divmod +from collections import defaultdict +_spread = defaultdict(int) +del defaultdict + +def _dec_str_to_int_inner(s, *, GUARD=8): + # Yes, BYTELIM is "large". Large enough that CPython will usually + # use the Karatsuba _str_to_int_inner to convert the string. This + # allowed reducing the cutoff for calling _this_ function from 3.5M + # to 2M digits. We could almost certainly do even better by + # fine-tuning this and/or using a larger output base than 256. + BYTELIM = 100_000 + D = decimal.Decimal + result = bytearray() + # See notes at end of file for discussion of GUARD. + assert GUARD > 0 # if 0, `decimal` can blow up - .prec 0 not allowed + + def inner(n, w): + #assert n < D256 ** w # required, but too expensive to check + if w <= BYTELIM: + # XXX Stefan Pochmann discovered that, for 1024-bit ints, + # `int(Decimal)` took 2.5x longer than `int(str(Decimal))`. + # Worse, `int(Decimal) is still quadratic-time for much + # larger ints. So unless/until all that is repaired, the + # seemingly redundant `str(Decimal)` is crucial to speed. + result.extend(int(str(n)).to_bytes(w)) # big-endian default + return + w1 = w >> 1 + w2 = w - w1 + if 0: + # This is maximally clear, but "too slow". `decimal` + # division is asymptotically fast, but we have no way to + # tell it to reuse the high-precision reciprocal it computes + # for pow256[w2], so it has to recompute it over & over & + # over again :-( + hi, lo = divmod(n, pow256[w2][0]) + else: + p256, recip = pow256[w2] + # The integer part will have a number of digits about equal + # to the difference between the log10s of `n` and `pow256` + # (which, since these are integers, is roughly approximated + # by `.adjusted()`). 
That's the working precision we need, + ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + GUARD + hi = +n * +recip # unary `+` chops back to ctx.prec digits + ctx.prec = decimal.MAX_PREC + hi = hi.to_integral_value() # lose the fractional digits + lo = n - hi * p256 + # Because we've been uniformly rounding down, `hi` is a + # lower bound on the correct quotient. + assert lo >= 0 + # Adjust quotient up if needed. It usually isn't. In random + # testing on inputs through 5 billion digit strings, the + # test triggered once in about 200 thousand tries. + count = 0 + if lo >= p256: + count = 1 + lo -= p256 + hi += 1 + if lo >= p256: + # Complete correction via an exact computation. I + # believe it's not possible to get here provided + # GUARD >= 3. It's tested by reducing GUARD below + # that. + count = 999 + hi2, lo = divmod(lo, p256) + hi += hi2 + _spread[count] += 1 + # The assert should always succeed, but way too slow to keep + # enabled. + #assert hi, lo == divmod(n, pow256[w2][0]) + inner(hi, w1) + del hi # at top levels, can free a lot of RAM "early" + inner(lo, w2) + + # How many base 256 digits are needed?. Mathematically, exactly + # floor(log256(int(s))) + 1. There is no cheap way to compute this. + # But we can get an upper bound, and that's necessary for our error + # analysis to make sense. int(s) < 10**len(s), so the log needed is + # < log256(10**len(s)) = len(s) * log256(10). However, using + # finite-precision floating point for this, it's possible that the + # computed value is a little less than the true value. If the true + # value is at - or a little higher than - an integer, we can get an + # off-by-1 error too low. So we add 2 instead of 1 if chopping lost + # a fraction > 0.9. + + # The "WASI" test platform can complain about `len(s)` if it's too + # large to fit in its idea of "an index-sized integer". 
+ lenS = s.__len__() + log_ub = lenS * _LOG_10_BASE_256 + log_ub_as_int = int(log_ub) + w = log_ub_as_int + 1 + (log_ub - log_ub_as_int > 0.9) + # And what if we've plain exhausted the limits of HW floats? We + # could compute the log to any desired precision using `decimal`, + # but it's not plausible that anyone will pass a string requiring + # trillions of bytes (unless they're just trying to "break things"). + if w.bit_length() >= 46: + # "Only" had < 53 - 46 = 7 bits to spare in IEEE-754 double. + raise ValueError(f"cannot convert string of len {lenS} to int") + with decimal.localcontext(_unbounded_dec_context) as ctx: + D256 = D(256) + pow256 = compute_powers(w, D256, BYTELIM, need_hi=True) + rpow256 = compute_powers(w, 1 / D256, BYTELIM, need_hi=True) + # We're going to do inexact, chopped arithmetic, multiplying by + # an approximation to the reciprocal of 256**i. We chop to get a + # lower bound on the true integer quotient. Our approximation is + # a lower bound, the multiplication is chopped too, and + # to_integral_value() is also chopped. + ctx.traps[decimal.Inexact] = 0 + ctx.rounding = decimal.ROUND_DOWN + for k, v in pow256.items(): + # No need to save much more precision in the reciprocal than + # the power of 256 has, plus some guard digits to absorb + # most relevant rounding errors. This is highly significant: + # 1/2**i has the same number of significant decimal digits + # as 5**i, generally over twice the number in 2**i, + ctx.prec = v.adjusted() + GUARD + 1 + # The unary "+" chops the reciprocal back to that precision. 
+ pow256[k] = v, +rpow256[k] + del rpow256 # exact reciprocals no longer needed + ctx.prec = decimal.MAX_PREC + inner(D(s), w) + return int.from_bytes(result) + +def int_from_string(s): + """Asymptotically fast version of PyLong_FromString(), conversion + of a string of decimal digits into an 'int'.""" + # PyLong_FromString() has already removed leading +/-, checked for invalid + # use of underscore characters, checked that string consists of only digits + # and underscores, and stripped leading whitespace. The input can still + # contain underscores and have trailing whitespace. + s = s.rstrip().replace('_', '') + func = _str_to_int_inner + if len(s) >= 2_000_000 and _decimal is not None: + func = _dec_str_to_int_inner + return func(s) + +def str_to_int(s): + """Asymptotically fast version of decimal string to 'int' conversion.""" + # FIXME: this doesn't support the full syntax that int() supports. + m = re.match(r'\s*([+-]?)([0-9_]+)\s*', s) + if not m: + raise ValueError('invalid literal for int() with base 10') + v = int_from_string(m.group(2)) + if m.group(1) == '-': + v = -v + return v + + +# Fast integer division, based on code from Mark Dickinson, fast_div.py +# GH-47701. Additional refinements and optimizations by Bjorn Martinsson. The +# algorithm is due to Burnikel and Ziegler, in their paper "Fast Recursive +# Division". + +_DIV_LIMIT = 4000 + + +def _div2n1n(a, b, n): + """Divide a 2n-bit nonnegative integer a by an n-bit positive integer + b, using a recursive divide-and-conquer algorithm. + + Inputs: + n is a positive integer + b is a positive integer with exactly n bits + a is a nonnegative integer such that a < 2**n * b + + Output: + (q, r) such that a = b*q+r and 0 <= r < b. 
+ + """ + if a.bit_length() - n <= _DIV_LIMIT: + return divmod(a, b) + pad = n & 1 + if pad: + a <<= 1 + b <<= 1 + n += 1 + half_n = n >> 1 + mask = (1 << half_n) - 1 + b1, b2 = b >> half_n, b & mask + q1, r = _div3n2n(a >> n, (a >> half_n) & mask, b, b1, b2, half_n) + q2, r = _div3n2n(r, a & mask, b, b1, b2, half_n) + if pad: + r >>= 1 + return q1 << half_n | q2, r + + +def _div3n2n(a12, a3, b, b1, b2, n): + """Helper function for _div2n1n; not intended to be called directly.""" + if a12 >> n == b1: + q, r = (1 << n) - 1, a12 - (b1 << n) + b1 + else: + q, r = _div2n1n(a12, b1, n) + r = (r << n | a3) - q * b2 + while r < 0: + q -= 1 + r += b + return q, r + + +def _int2digits(a, n): + """Decompose non-negative int a into base 2**n + + Input: + a is a non-negative integer + + Output: + List of the digits of a in base 2**n in little-endian order, + meaning the most significant digit is last. The most + significant digit is guaranteed to be non-zero. + If a is 0 then the output is an empty list. + + """ + a_digits = [0] * ((a.bit_length() + n - 1) // n) + + def inner(x, L, R): + if L + 1 == R: + a_digits[L] = x + return + mid = (L + R) >> 1 + shift = (mid - L) * n + upper = x >> shift + lower = x ^ (upper << shift) + inner(lower, L, mid) + inner(upper, mid, R) + + if a: + inner(a, 0, len(a_digits)) + return a_digits + + +def _digits2int(digits, n): + """Combine base-2**n digits into an int. This function is the + inverse of `_int2digits`. For more details, see _int2digits. 
+ """ + + def inner(L, R): + if L + 1 == R: + return digits[L] + mid = (L + R) >> 1 + shift = (mid - L) * n + return (inner(mid, R) << shift) + inner(L, mid) + + return inner(0, len(digits)) if digits else 0 + + +def _divmod_pos(a, b): + """Divide a non-negative integer a by a positive integer b, giving + quotient and remainder.""" + # Use grade-school algorithm in base 2**n, n = nbits(b) + n = b.bit_length() + a_digits = _int2digits(a, n) + + r = 0 + q_digits = [] + for a_digit in reversed(a_digits): + q_digit, r = _div2n1n((r << n) + a_digit, b, n) + q_digits.append(q_digit) + q_digits.reverse() + q = _digits2int(q_digits, n) + return q, r + + +def int_divmod(a, b): + """Asymptotically fast replacement for divmod, for 'int'. + Its time complexity is O(n**1.58), where n = #bits(a) + #bits(b). + """ + if b == 0: + raise ZeroDivisionError('division by zero') + elif b < 0: + q, r = int_divmod(-a, -b) + return q, -r + elif a < 0: + q, r = int_divmod(~a, b) + return ~q, b + ~r + else: + return _divmod_pos(a, b) + + +# Notes on _dec_str_to_int_inner: +# +# Stefan Pochmann worked up a str->int function that used the decimal +# module to, in effect, convert from base 10 to base 256. This is +# "unnatural", in that it requires multiplying and dividing by large +# powers of 2, which `decimal` isn't naturally suited to. But +# `decimal`'s `*` and `/` are asymptotically superior to CPython's, so +# at _some_ point it could be expected to win. +# +# Alas, the crossover point was too high to be of much real interest. I +# (Tim) then worked on ways to replace its division with multiplication +# by a cached reciprocal approximation instead, fixing up errors +# afterwards. This reduced the crossover point significantly, +# +# I revisited the code, and found ways to improve and simplify it. The +# crossover point is at about 3.4 million digits now. +# +# About .adjusted() +# ----------------- +# Restrict to Decimal values x > 0. 
We don't use negative numbers in the +# code, and I don't want to have to keep typing, e.g., "absolute value". +# +# For convenience, I'll use `x.a` to mean `x.adjusted()`. x.a doesn't +# look at the digits of x, but instead returns an integer giving x's +# order of magnitude. These are equivalent: +# +# - x.a is the power-of-10 exponent of x's most significant digit. +# - x.a = the infinitely precise floor(log10(x)) +# - x can be written in this form, where f is a real with 1 <= f < 10: +# x = f * 10**x.a +# +# Observation; if x is an integer, len(str(x)) = x.a + 1. +# +# Lemma 1: (x * y).a = x.a + y.a, or one larger +# +# Proof: Write x = f * 10**x.a and y = g * 10**y.a, where f and g are in +# [1, 10). Then x*y = f*g * 10**(x.a + y.a), where 1 <= f*g < 100. If +# f*g < 10, (x*y).a is x.a+y.a. Else divide f*g by 10 to bring it back +# into [1, 10], and add 1 to the exponent to compensate. Then (x*y).a is +# x.a+y.a+1. +# +# Lemma 2: ceiling(log10(x/y)) <= x.a - y.a + 1 +# +# Proof: Express x and y as in Lemma 1. Then x/y = f/g * 10**(x.a - +# y.a), where 1/10 < f/g < 10. If 1 <= f/g, (x/y).a is x.a-y.a. Else +# multiply f/g by 10 to bring it back into [1, 10], and subtract 1 from +# the exponent to compensate. Then (x/y).a is x.a-y.a-1. So the largest +# (x/y).a can be is x.a-y.a. Since that's the floor of log10(x/y). the +# ceiling is at most 1 larger (with equality iff f/g = 1 exactly). +# +# GUARD digits +# ------------ +# We only want the integer part of divisions, so don't need to build +# the full multiplication tree. But using _just_ the number of +# digits expected in the integer part ignores too much. What's left +# out can have a very significant effect on the quotient. So we use +# GUARD additional digits. +# +# The default 8 is more than enough so no more than 1 correction step +# was ever needed for all inputs tried through 2.5 billion digits. 
In +# fact, I believe 3 guard digits are always enough - but the proof is +# very involved, so better safe than sorry. +# +# Short course: +# +# If prec is the decimal precision in effect, and we're rounding down, +# the result of an operation is exactly equal to the infinitely precise +# result times 1-e for some real e with 0 <= e < 10**(1-prec). In +# +# ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + GUARD +# hi = +n * +recip # unary `+` chops to ctx.prec digits +# +# we have 3 visible chopped operations, but there's also a 4th: +# precomputing a truncated `recip` as part of setup. +# +# So the computed product is exactly equal to the true product times +# (1-e1)*(1-e2)*(1-e3)*(1-e4); since the e's are all very small, an +# excellent approximation to the second factor is 1-(e1+e2+e3+e4) (the +# 2nd and higher order terms in the expanded product are too tiny to +# matter). If they're all as large as possible, that's +# +# 1 - 4*10**(1-prec). This, BTW, is all bog-standard FP error analysis. +# +# That implies the computed product is within 1 of the true product +# provided prec >= log10(true_product) + 1.602. +# +# Here are telegraphic details, rephrasing the initial condition in +# equivalent ways, step by step: +# +# prod - prod * (1 - 4*10**(1-prec)) <= 1 +# prod - prod + prod * 4*10**(1-prec)) <= 1 +# prod * 4*10**(1-prec)) <= 1 +# 10**(log10(prod)) * 4*10**(1-prec)) <= 1 +# 4*10**(1-prec+log10(prod))) <= 1 +# 10**(1-prec+log10(prod))) <= 1/4 +# 1-prec+log10(prod) <= log10(1/4) = -0.602 +# -prec <= -1.602 - log10(prod) +# prec >= log10(prod) + 1.602 +# +# The true product is the same as the true ratio n/p256. By Lemma 2 +# above, n.a - p256.a + 1 is an upper bound on the ceiling of +# log10(prod). Then 2 is the ceiling of 1.602. so n.a - p256.a + 3 is an +# upper bound on the right hand side of the inequality. Any prec >= that +# will work. +# +# But since this is just a sketch of a proof ;-), the code uses the +# empirically tested 8 instead of 3. 
5 digits more or less makes no +# practical difference to speed - these ints are huge. And while +# increasing GUARD above 3 may not be necessary, every increase cuts the +# percentage of cases that need a correction at all. +# +# On Computing Reciprocals +# ------------------------ +# In general, the exact reciprocals we compute have over twice as many +# significant digits as needed. 1/256**i has the same number of +# significant decimal digits as 5**i. It's a significant waste of RAM +# to store all those unneeded digits. +# +# So we cut exact reciprocals back to the least precision that can +# be needed so that the error analysis above is valid, +# +# [Note: turns out it's very significantly faster to do it this way than +# to compute 1 / 256**i directly to the desired precision, because the +# power method doesn't require division. It's also faster than computing +# (1/256)**i directly to the desired precision - no material division +# there, but `compute_powers()` is much smarter about _how_ to compute +# all the powers needed than repeated applications of `**` - that +# function invokes `**` for at most the few smallest powers needed.] +# +# The hard part is that chopping back to a shorter width occurs +# _outside_ of `inner`. We can't know then what `prec` `inner()` will +# need. We have to pick, for each value of `w2`, the largest possible +# value `prec` can become when `inner()` is working on `w2`. +# +# This is the `prec` inner() uses: +# max(n.a - p256.a, 0) + GUARD +# and what setup uses (renaming its `v` to `p256` - same thing): +# p256.a + GUARD + 1 +# +# We need that the second is always at least as large as the first, +# which is the same as requiring +# +# n.a - 2 * p256.a <= 1 +# +# What's the largest n can be? n < 255**w = 256**(w2 + (w - w2)). The +# worst case in this context is when w ix even. and then w = 2*w2, so +# n < 256**(2*w2) = (256**w2)**2 = p256**2. By Lemma 1, then, n.a +# is at most p256.a + p256.a + 1. 
+# +# So the most n.a - 2 * p256.a can be is +# p256.a + p256.a + 1 - 2 * p256.a = 1. QED +# +# Note: an earlier version of the code split on floor(e/2) instead of on +# the ceiling. The worst case then is odd `w`, and a more involved proof +# was needed to show that adding 4 (instead of 1) may be necessary. +# Basically because, in that case, n may be up to 256 times larger than +# p256**2. Curiously enough, by splitting on the ceiling instead, +# nothing in any proof here actually depends on the output base (256). + +# Enable for brute-force testing of compute_powers(). This takes about a +# minute, because it tries millions of cases. +if 0: + def consumer(w, limit, need_hi): + seen = set() + need = set() + def inner(w): + if w <= limit: + return + if w in seen: + return + seen.add(w) + lo = w >> 1 + hi = w - lo + need.add(hi if need_hi else lo) + inner(lo) + inner(hi) + inner(w) + exp = compute_powers(w, 1, limit, need_hi=need_hi) + assert exp.keys() == need + + from itertools import chain + for need_hi in (False, True): + for limit in (0, 1, 10, 100, 1_000, 10_000, 100_000): + for w in chain(range(1, 100_000), + (10**i for i in range(5, 30))): + consumer(w, limit, need_hi) diff --git a/python/python3_13/examples/_sitebuiltins.py b/python/python3_14/examples/_sitebuiltins.py similarity index 100% rename from python/python3_13/examples/_sitebuiltins.py rename to python/python3_14/examples/_sitebuiltins.py diff --git a/python/python3_13/examples/_strptime.py b/python/python3_14/examples/_strptime.py similarity index 75% rename from python/python3_13/examples/_strptime.py rename to python/python3_14/examples/_strptime.py index 4c68a6a88e..fc7e369c3d 100644 --- a/python/python3_13/examples/_strptime.py +++ b/python/python3_14/examples/_strptime.py @@ -14,6 +14,7 @@ import time import locale import calendar +import re from re import compile as re_compile from re import sub as re_sub from re import IGNORECASE @@ -41,6 +42,29 @@ def _findall(haystack, needle): yield i 
i += len(needle) +def _fixmonths(months): + yield from months + # The lower case of 'İ' ('\u0130') is 'i\u0307'. + # The re module only supports 1-to-1 character matching in + # case-insensitive mode. + for s in months: + if 'i\u0307' in s: + yield s.replace('i\u0307', '\u0130') + +lzh_TW_alt_digits = ( + # 〇:一:二:三:四:五:六:七:八:九 + '\u3007', '\u4e00', '\u4e8c', '\u4e09', '\u56db', + '\u4e94', '\u516d', '\u4e03', '\u516b', '\u4e5d', + # 十:十一:十二:十三:十四:十五:十六:十七:十八:十九 + '\u5341', '\u5341\u4e00', '\u5341\u4e8c', '\u5341\u4e09', '\u5341\u56db', + '\u5341\u4e94', '\u5341\u516d', '\u5341\u4e03', '\u5341\u516b', '\u5341\u4e5d', + # 廿:廿一:廿二:廿三:廿四:廿五:廿六:廿七:廿八:廿九 + '\u5eff', '\u5eff\u4e00', '\u5eff\u4e8c', '\u5eff\u4e09', '\u5eff\u56db', + '\u5eff\u4e94', '\u5eff\u516d', '\u5eff\u4e03', '\u5eff\u516b', '\u5eff\u4e5d', + # 卅:卅一 + '\u5345', '\u5345\u4e00') + + class LocaleTime(object): """Stores and handles locale-specific information related to time. @@ -84,6 +108,7 @@ def __init__(self): self.__calc_weekday() self.__calc_month() self.__calc_am_pm() + self.__calc_alt_digits() self.__calc_timezone() self.__calc_date_time() if _getlang() != self.lang: @@ -119,9 +144,43 @@ def __calc_am_pm(self): am_pm.append(time.strftime("%p", time_tuple).lower().strip()) self.am_pm = am_pm + def __calc_alt_digits(self): + # Set self.LC_alt_digits by using time.strftime(). + + # The magic data should contain all decimal digits. + time_tuple = time.struct_time((1998, 1, 27, 10, 43, 56, 1, 27, 0)) + s = time.strftime("%x%X", time_tuple) + if s.isascii(): + # Fast path -- all digits are ASCII. + self.LC_alt_digits = () + return + + digits = ''.join(sorted(set(re.findall(r'\d', s)))) + if len(digits) == 10 and ord(digits[-1]) == ord(digits[0]) + 9: + # All 10 decimal digits from the same set. + if digits.isascii(): + # All digits are ASCII. + self.LC_alt_digits = () + return + + self.LC_alt_digits = [a + b for a in digits for b in digits] + # Test whether the numbers contain leading zero. 
+ time_tuple2 = time.struct_time((2000, 1, 1, 1, 1, 1, 5, 1, 0)) + if self.LC_alt_digits[1] not in time.strftime("%x %X", time_tuple2): + self.LC_alt_digits[:10] = digits + return + + # Either non-Gregorian calendar or non-decimal numbers. + if {'\u4e00', '\u4e03', '\u4e5d', '\u5341', '\u5eff'}.issubset(s): + # lzh_TW + self.LC_alt_digits = lzh_TW_alt_digits + return + + self.LC_alt_digits = None + def __calc_date_time(self): - # Set self.date_time, self.date, & self.time by using - # time.strftime(). + # Set self.LC_date_time, self.LC_date, self.LC_time and + # self.LC_time_ampm by using time.strftime(). # Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of # overloaded numbers is minimized. The order in which searches for @@ -129,26 +188,32 @@ def __calc_date_time(self): # possible ambiguity for what something represents. time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0)) time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0)) - replacement_pairs = [ + replacement_pairs = [] + + # Non-ASCII digits + if self.LC_alt_digits or self.LC_alt_digits is None: + for n, d in [(19, '%OC'), (99, '%Oy'), (22, '%OH'), + (44, '%OM'), (55, '%OS'), (17, '%Od'), + (3, '%Om'), (2, '%Ow'), (10, '%OI')]: + if self.LC_alt_digits is None: + s = chr(0x660 + n // 10) + chr(0x660 + n % 10) + replacement_pairs.append((s, d)) + if n < 10: + replacement_pairs.append((s[1], d)) + elif len(self.LC_alt_digits) > n: + replacement_pairs.append((self.LC_alt_digits[n], d)) + else: + replacement_pairs.append((time.strftime(d, time_tuple), d)) + replacement_pairs += [ ('1999', '%Y'), ('99', '%y'), ('22', '%H'), ('44', '%M'), ('55', '%S'), ('76', '%j'), ('17', '%d'), ('03', '%m'), ('3', '%m'), # '3' needed for when no leading zero. 
('2', '%w'), ('10', '%I'), - # Non-ASCII digits - ('\u0661\u0669\u0669\u0669', '%Y'), - ('\u0669\u0669', '%Oy'), - ('\u0662\u0662', '%OH'), - ('\u0664\u0664', '%OM'), - ('\u0665\u0665', '%OS'), - ('\u0661\u0667', '%Od'), - ('\u0660\u0663', '%Om'), - ('\u0663', '%Om'), - ('\u0662', '%Ow'), - ('\u0661\u0660', '%OI'), ] + date_time = [] - for directive in ('%c', '%x', '%X'): + for directive in ('%c', '%x', '%X', '%r'): current_format = time.strftime(directive, time_tuple).lower() current_format = current_format.replace('%', '%%') # The month and the day of the week formats are treated specially @@ -172,9 +237,10 @@ def __calc_date_time(self): if tz: current_format = current_format.replace(tz, "%Z") # Transform all non-ASCII digits to digits in range U+0660 to U+0669. - current_format = re_sub(r'\d(?3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])", 'f': r"(?P[0-9]{1,6})", - 'H': r"(?P2[0-3]|[0-1]\d|\d)", + 'H': r"(?P2[0-3]|[0-1]\d|\d| \d)", + 'k': r"(?P2[0-3]|[0-1]\d|\d| \d)", 'I': r"(?P1[0-2]|0[1-9]|[1-9]| [1-9])", + 'l': r"(?P1[0-2]|0[1-9]|[1-9]| [1-9])", 'G': r"(?P\d\d\d\d)", 'j': r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])", 'm': r"(?P1[0-2]|0[1-9]|[1-9])", @@ -301,29 +370,60 @@ def __init__(self, locale_time=None): 'V': r"(?P5[0-3]|0[1-9]|[1-4]\d|\d)", # W is set below by using 'U' 'y': r"(?P\d\d)", - #XXX: Does 'Y' need to worry about having less or more than - # 4 digits? 
'Y': r"(?P\d\d\d\d)", 'z': r"(?P[+-]\d\d:?[0-5]\d(:?[0-5]\d(\.\d{1,6})?)?|(?-i:Z))", 'A': self.__seqToRE(self.locale_time.f_weekday, 'A'), 'a': self.__seqToRE(self.locale_time.a_weekday, 'a'), - 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), - 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), + 'B': self.__seqToRE(_fixmonths(self.locale_time.f_month[1:]), 'B'), + 'b': self.__seqToRE(_fixmonths(self.locale_time.a_month[1:]), 'b'), 'p': self.__seqToRE(self.locale_time.am_pm, 'p'), 'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone for tz in tz_names), 'Z'), '%': '%'} - for d in 'dmyHIMS': - mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d - mapping['Ow'] = r'(?P\d)' + if self.locale_time.LC_alt_digits is None: + for d in 'dmyCHIMS': + mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d + mapping['Ow'] = r'(?P\d)' + else: + mapping.update({ + 'Od': self.__seqToRE(self.locale_time.LC_alt_digits[1:32], 'd', + '3[0-1]|[1-2][0-9]|0[1-9]|[1-9]'), + 'Om': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'm', + '1[0-2]|0[1-9]|[1-9]'), + 'Ow': self.__seqToRE(self.locale_time.LC_alt_digits[:7], 'w', + '[0-6]'), + 'Oy': self.__seqToRE(self.locale_time.LC_alt_digits, 'y', + '[0-9][0-9]'), + 'OC': self.__seqToRE(self.locale_time.LC_alt_digits, 'C', + '[0-9][0-9]'), + 'OH': self.__seqToRE(self.locale_time.LC_alt_digits[:24], 'H', + '2[0-3]|[0-1][0-9]|[0-9]'), + 'OI': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'I', + '1[0-2]|0[1-9]|[1-9]'), + 'OM': self.__seqToRE(self.locale_time.LC_alt_digits[:60], 'M', + '[0-5][0-9]|[0-9]'), + 'OS': self.__seqToRE(self.locale_time.LC_alt_digits[:62], 'S', + '6[0-1]|[0-5][0-9]|[0-9]'), + }) + mapping.update({ + 'e': mapping['d'], + 'Oe': mapping['Od'], + 'P': mapping['p'], + 'Op': mapping['p'], + 'W': mapping['U'].replace('U', 'W'), + }) mapping['W'] = mapping['U'].replace('U', 'W') + base.__init__(mapping) + base.__setitem__('T', self.pattern('%H:%M:%S')) + base.__setitem__('R', self.pattern('%H:%M')) + 
base.__setitem__('r', self.pattern(self.locale_time.LC_time_ampm)) base.__setitem__('X', self.pattern(self.locale_time.LC_time)) base.__setitem__('x', self.pattern(self.locale_time.LC_date)) base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) - def __seqToRE(self, to_convert, directive): + def __seqToRE(self, to_convert, directive, altregex=None): """Convert a list to a regex string for matching a directive. Want possible matching values to be from longest to shortest. This @@ -339,8 +439,9 @@ def __seqToRE(self, to_convert, directive): else: return '' regex = '|'.join(re_escape(stuff) for stuff in to_convert) - regex = '(?P<%s>%s' % (directive, regex) - return '%s)' % regex + if altregex is not None: + regex += '|' + altregex + return '(?P<%s>%s)' % (directive, regex) def pattern(self, format): """Return regex pattern for the format string. @@ -367,11 +468,11 @@ def repl(m): nonlocal day_of_month_in_format day_of_month_in_format = True return self[format_char] - format = re_sub(r'%(O?.)', repl, format) + format = re_sub(r'%[-_0^#]*[0-9]*([OE]?\\?.?)', repl, format) if day_of_month_in_format and not year_in_format: import warnings warnings.warn("""\ -Parsing dates involving a day of month without a year specified is ambiguious +Parsing dates involving a day of month without a year specified is ambiguous and fails to parse leap day. The default behavior will change in Python 3.15 to either always raise an exception or to use a different default year (TBD). To avoid trouble, add a specific year to the input & format. 
@@ -441,14 +542,13 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"): # \\, in which case it was a stray % but with a space after it except KeyError as err: bad_directive = err.args[0] - if bad_directive == "\\": - bad_directive = "%" del err + bad_directive = bad_directive.replace('\\s', '') + if not bad_directive: + raise ValueError("stray %% in format '%s'" % format) from None + bad_directive = bad_directive.replace('\\', '', 1) raise ValueError("'%s' is a bad directive in format '%s'" % (bad_directive, format)) from None - # IndexError only occurs when the format string is "%" - except IndexError: - raise ValueError("stray %% in format '%s'" % format) from None _regex_cache[format] = format_regex found = format_regex.match(data_string) if not found: @@ -470,6 +570,15 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"): # values weekday = julian = None found_dict = found.groupdict() + if locale_time.LC_alt_digits: + def parse_int(s): + try: + return locale_time.LC_alt_digits.index(s) + except ValueError: + return int(s) + else: + parse_int = int + for group_key in found_dict.keys(): # Directives not explicitly handled below: # c, x, X @@ -477,30 +586,34 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"): # U, W # worthless without day of the week if group_key == 'y': - year = int(found_dict['y']) - # Open Group specification for strptime() states that a %y - #value in the range of [00, 68] is in the century 2000, while - #[69,99] is in the century 1900 - if year <= 68: - year += 2000 + year = parse_int(found_dict['y']) + if 'C' in found_dict: + century = parse_int(found_dict['C']) + year += century * 100 else: - year += 1900 + # Open Group specification for strptime() states that a %y + #value in the range of [00, 68] is in the century 2000, while + #[69,99] is in the century 1900 + if year <= 68: + year += 2000 + else: + year += 1900 elif group_key == 'Y': year = int(found_dict['Y']) elif group_key == 'G': iso_year = 
int(found_dict['G']) elif group_key == 'm': - month = int(found_dict['m']) + month = parse_int(found_dict['m']) elif group_key == 'B': month = locale_time.f_month.index(found_dict['B'].lower()) elif group_key == 'b': month = locale_time.a_month.index(found_dict['b'].lower()) elif group_key == 'd': - day = int(found_dict['d']) + day = parse_int(found_dict['d']) elif group_key == 'H': - hour = int(found_dict['H']) + hour = parse_int(found_dict['H']) elif group_key == 'I': - hour = int(found_dict['I']) + hour = parse_int(found_dict['I']) ampm = found_dict.get('p', '').lower() # If there was no AM/PM indicator, we'll treat this like AM if ampm in ('', locale_time.am_pm[0]): @@ -516,9 +629,9 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"): if hour != 12: hour += 12 elif group_key == 'M': - minute = int(found_dict['M']) + minute = parse_int(found_dict['M']) elif group_key == 'S': - second = int(found_dict['S']) + second = parse_int(found_dict['S']) elif group_key == 'f': s = found_dict['f'] # Pad to always return microseconds. 
@@ -670,18 +783,40 @@ def _strptime_time(data_string, format="%a %b %d %H:%M:%S %Y"): tt = _strptime(data_string, format)[0] return time.struct_time(tt[:time._STRUCT_TM_ITEMS]) -def _strptime_datetime(cls, data_string, format="%a %b %d %H:%M:%S %Y"): - """Return a class cls instance based on the input string and the +def _strptime_datetime_date(cls, data_string, format="%a %b %d %Y"): + """Return a date instance based on the input string and the + format string.""" + tt, _, _ = _strptime(data_string, format) + args = tt[:3] + return cls(*args) + +def _parse_tz(tzname, gmtoff, gmtoff_fraction): + tzdelta = datetime_timedelta(seconds=gmtoff, microseconds=gmtoff_fraction) + if tzname: + return datetime_timezone(tzdelta, tzname) + else: + return datetime_timezone(tzdelta) + +def _strptime_datetime_time(cls, data_string, format="%H:%M:%S"): + """Return a time instance based on the input string and the format string.""" tt, fraction, gmtoff_fraction = _strptime(data_string, format) tzname, gmtoff = tt[-2:] - args = tt[:6] + (fraction,) - if gmtoff is not None: - tzdelta = datetime_timedelta(seconds=gmtoff, microseconds=gmtoff_fraction) - if tzname: - tz = datetime_timezone(tzdelta, tzname) - else: - tz = datetime_timezone(tzdelta) - args += (tz,) + args = tt[3:6] + (fraction,) + if gmtoff is None: + return cls(*args) + else: + tz = _parse_tz(tzname, gmtoff, gmtoff_fraction) + return cls(*args, tz) - return cls(*args) +def _strptime_datetime_datetime(cls, data_string, format="%a %b %d %H:%M:%S %Y"): + """Return a datetime instance based on the input string and the + format string.""" + tt, fraction, gmtoff_fraction = _strptime(data_string, format) + tzname, gmtoff = tt[-2:] + args = tt[:6] + (fraction,) + if gmtoff is None: + return cls(*args) + else: + tz = _parse_tz(tzname, gmtoff, gmtoff_fraction) + return cls(*args, tz) diff --git a/python/python3_13/pom.xml b/python/python3_14/pom.xml similarity index 95% rename from python/python3_13/pom.xml rename to 
python/python3_14/pom.xml index 82ae939c27..bf0e52aac8 100644 --- a/python/python3_13/pom.xml +++ b/python/python3_14/pom.xml @@ -1,8 +1,8 @@ 4.0.0 - python3-13 + python3-14 jar - Python3.13 grammar + Python3.14 grammar org.antlr.grammars pythonparent From ffd9e67aa34468e593c6271a562655da332aa7d5 Mon Sep 17 00:00:00 2001 From: Robert Einhorn Date: Thu, 25 Dec 2025 23:48:31 +0100 Subject: [PATCH 2/6] Update Python 3.14.2 grammar Updated the Python 3.14.2 grammar and synchronized related lexer/parser base files. This update includes rule refinements, token adjustments, directory restructuring, and cleanup of outdated Python 3.13 artifacts to ensure full alignment with the 3.14.2 specification. Signed-off-by: Robert Einhorn --- python/python3_14/CSharp/PythonLexerBase.cs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/python3_14/CSharp/PythonLexerBase.cs b/python/python3_14/CSharp/PythonLexerBase.cs index 02d87ba3dd..0b3ae6b28e 100644 --- a/python/python3_14/CSharp/PythonLexerBase.cs +++ b/python/python3_14/CSharp/PythonLexerBase.cs @@ -27,6 +27,10 @@ THE SOFTWARE. */ using Antlr4.Runtime; +using System; +using System.Collections.Generic; +using System.IO; + [assembly: CLSCompliant(true)] From a2c67a478f34e328d3ad107a3753a6a5a10b0460 Mon Sep 17 00:00:00 2001 From: Robert Einhorn Date: Fri, 26 Dec 2025 11:00:51 +0100 Subject: [PATCH 3/6] Update README.md --- python/python3_14/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/python3_14/README.md b/python/python3_14/README.md index b5d4f824b0..50cc7d1725 100644 --- a/python/python3_14/README.md +++ b/python/python3_14/README.md @@ -17,7 +17,5 @@ - tokenizing BOM Unicode character at the start of the file so it is skipped in the token stream - moved encoding detection from PythonLexerBase to a separate component -#### [Previous changes](https://github.com/antlr/grammars-v4/blob/master/python/python3_14/changes.md)

- ### Related link: [ANTLR4-parser-for-Python-3.14](https://github.com/RobEin/ANTLR4-parser-for-Python-3.14) \ No newline at end of file From ea1fc62153bfbbf2b1ae700bb2e9d4c9471ff11a Mon Sep 17 00:00:00 2001 From: Robert Einhorn Date: Sat, 27 Dec 2025 16:41:55 +0100 Subject: [PATCH 4/6] update PythonLexerBase --- python/python3_14/JavaScript/PythonLexerBase.js | 2 +- python/python3_14/Python3/PythonLexerBase.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/python3_14/JavaScript/PythonLexerBase.js b/python/python3_14/JavaScript/PythonLexerBase.js index 1aed61866d..53280d051d 100644 --- a/python/python3_14/JavaScript/PythonLexerBase.js +++ b/python/python3_14/JavaScript/PythonLexerBase.js @@ -106,7 +106,7 @@ export default class PythonLexerBase extends Lexer { * @param {string} encodingName - The encoding name (e.g., "utf-8"), or empty string to disable ENCODING token. */ setEncodingName(encodingName) { - this.encodingName = encodingName; + this.#encodingName = encodingName; } nextToken() { // Reading the input stream until EOF is reached diff --git a/python/python3_14/Python3/PythonLexerBase.py b/python/python3_14/Python3/PythonLexerBase.py index c4d4026974..8311f59816 100644 --- a/python/python3_14/Python3/PythonLexerBase.py +++ b/python/python3_14/Python3/PythonLexerBase.py @@ -83,7 +83,7 @@ def set_encoding_name(self, encoding_name: str) -> None: :param encoding_name: The encoding name (e.g., "utf-8"), or empty string to disable ENCODING token. 
""" - self.encoding_name = encoding_name + self._encodingName = encoding_name def nextToken(self) -> CommonToken: # Reading the input stream until EOF is reached self._check_next_token() From 209a0a556897eb60ccd4a6d8c052ba810008f1c9 Mon Sep 17 00:00:00 2001 From: Robert Einhorn Date: Sat, 27 Dec 2025 21:21:34 +0100 Subject: [PATCH 5/6] Update PythonLexer.g4 --- python/python3_14/PythonLexer.g4 | 833 +++++++++++++++++++++++-------- 1 file changed, 620 insertions(+), 213 deletions(-) diff --git a/python/python3_14/PythonLexer.g4 b/python/python3_14/PythonLexer.g4 index a6ed067c3c..488dd0a919 100644 --- a/python/python3_14/PythonLexer.g4 +++ b/python/python3_14/PythonLexer.g4 @@ -578,363 +578,689 @@ fragment IMAG_NUMBER : (FLOAT_NUMBER | DIGIT_PART) ('j' | 'J'); fragment ID_CONTINUE // for Python 3.14.2 : ID_START | '\u{0030}' .. '\u{0039}' + | '\u{0041}' .. '\u{005A}' + | '\u{005F}' + | '\u{0061}' .. '\u{007A}' + | '\u{00AA}' + | '\u{00B5}' | '\u{00B7}' - | '\u{0300}' .. '\u{036F}' - | '\u{0387}' + | '\u{00BA}' + | '\u{00C0}' .. '\u{00D6}' + | '\u{00D8}' .. '\u{00F6}' + | '\u{00F8}' .. '\u{02C1}' + | '\u{02C6}' .. '\u{02D1}' + | '\u{02E0}' .. '\u{02E4}' + | '\u{02EC}' + | '\u{02EE}' + | '\u{0300}' .. '\u{0374}' + | '\u{0376}' .. '\u{0377}' + | '\u{037B}' .. '\u{037D}' + | '\u{037F}' + | '\u{0386}' .. '\u{038A}' + | '\u{038C}' + | '\u{038E}' .. '\u{03A1}' + | '\u{03A3}' .. '\u{03F5}' + | '\u{03F7}' .. '\u{0481}' | '\u{0483}' .. '\u{0487}' + | '\u{048A}' .. '\u{052F}' + | '\u{0531}' .. '\u{0556}' + | '\u{0559}' + | '\u{0560}' .. '\u{0588}' | '\u{0591}' .. '\u{05BD}' | '\u{05BF}' | '\u{05C1}' .. '\u{05C2}' | '\u{05C4}' .. '\u{05C5}' | '\u{05C7}' + | '\u{05D0}' .. '\u{05EA}' + | '\u{05EF}' .. '\u{05F2}' | '\u{0610}' .. '\u{061A}' - | '\u{064B}' .. '\u{0669}' - | '\u{0670}' - | '\u{06D6}' .. '\u{06DC}' - | '\u{06DF}' .. '\u{06E4}' - | '\u{06E7}' .. '\u{06E8}' - | '\u{06EA}' .. '\u{06ED}' - | '\u{06F0}' .. '\u{06F9}' - | '\u{0711}' - | '\u{0730}' .. 
'\u{074A}' - | '\u{07A6}' .. '\u{07B0}' - | '\u{07C0}' .. '\u{07C9}' - | '\u{07EB}' .. '\u{07F3}' + | '\u{0620}' .. '\u{0669}' + | '\u{066E}' .. '\u{06D3}' + | '\u{06D5}' .. '\u{06DC}' + | '\u{06DF}' .. '\u{06E8}' + | '\u{06EA}' .. '\u{06FC}' + | '\u{06FF}' + | '\u{0710}' .. '\u{074A}' + | '\u{074D}' .. '\u{07B1}' + | '\u{07C0}' .. '\u{07F5}' + | '\u{07FA}' | '\u{07FD}' - | '\u{0816}' .. '\u{0819}' - | '\u{081B}' .. '\u{0823}' - | '\u{0825}' .. '\u{0827}' - | '\u{0829}' .. '\u{082D}' - | '\u{0859}' .. '\u{085B}' - | '\u{0897}' .. '\u{089F}' - | '\u{08CA}' .. '\u{08E1}' - | '\u{08E3}' .. '\u{0903}' - | '\u{093A}' .. '\u{093C}' - | '\u{093E}' .. '\u{094F}' - | '\u{0951}' .. '\u{0957}' - | '\u{0962}' .. '\u{0963}' + | '\u{0800}' .. '\u{082D}' + | '\u{0840}' .. '\u{085B}' + | '\u{0860}' .. '\u{086A}' + | '\u{0870}' .. '\u{0887}' + | '\u{0889}' .. '\u{088E}' + | '\u{0897}' .. '\u{08E1}' + | '\u{08E3}' .. '\u{0963}' | '\u{0966}' .. '\u{096F}' - | '\u{0981}' .. '\u{0983}' - | '\u{09BC}' - | '\u{09BE}' .. '\u{09C4}' + | '\u{0971}' .. '\u{0983}' + | '\u{0985}' .. '\u{098C}' + | '\u{098F}' .. '\u{0990}' + | '\u{0993}' .. '\u{09A8}' + | '\u{09AA}' .. '\u{09B0}' + | '\u{09B2}' + | '\u{09B6}' .. '\u{09B9}' + | '\u{09BC}' .. '\u{09C4}' | '\u{09C7}' .. '\u{09C8}' - | '\u{09CB}' .. '\u{09CD}' + | '\u{09CB}' .. '\u{09CE}' | '\u{09D7}' - | '\u{09E2}' .. '\u{09E3}' - | '\u{09E6}' .. '\u{09EF}' + | '\u{09DC}' .. '\u{09DD}' + | '\u{09DF}' .. '\u{09E3}' + | '\u{09E6}' .. '\u{09F1}' + | '\u{09FC}' | '\u{09FE}' | '\u{0A01}' .. '\u{0A03}' + | '\u{0A05}' .. '\u{0A0A}' + | '\u{0A0F}' .. '\u{0A10}' + | '\u{0A13}' .. '\u{0A28}' + | '\u{0A2A}' .. '\u{0A30}' + | '\u{0A32}' .. '\u{0A33}' + | '\u{0A35}' .. '\u{0A36}' + | '\u{0A38}' .. '\u{0A39}' | '\u{0A3C}' | '\u{0A3E}' .. '\u{0A42}' | '\u{0A47}' .. '\u{0A48}' | '\u{0A4B}' .. '\u{0A4D}' | '\u{0A51}' - | '\u{0A66}' .. '\u{0A71}' - | '\u{0A75}' + | '\u{0A59}' .. '\u{0A5C}' + | '\u{0A5E}' + | '\u{0A66}' .. '\u{0A75}' | '\u{0A81}' .. 
'\u{0A83}' - | '\u{0ABC}' - | '\u{0ABE}' .. '\u{0AC5}' + | '\u{0A85}' .. '\u{0A8D}' + | '\u{0A8F}' .. '\u{0A91}' + | '\u{0A93}' .. '\u{0AA8}' + | '\u{0AAA}' .. '\u{0AB0}' + | '\u{0AB2}' .. '\u{0AB3}' + | '\u{0AB5}' .. '\u{0AB9}' + | '\u{0ABC}' .. '\u{0AC5}' | '\u{0AC7}' .. '\u{0AC9}' | '\u{0ACB}' .. '\u{0ACD}' - | '\u{0AE2}' .. '\u{0AE3}' + | '\u{0AD0}' + | '\u{0AE0}' .. '\u{0AE3}' | '\u{0AE6}' .. '\u{0AEF}' - | '\u{0AFA}' .. '\u{0AFF}' + | '\u{0AF9}' .. '\u{0AFF}' | '\u{0B01}' .. '\u{0B03}' - | '\u{0B3C}' - | '\u{0B3E}' .. '\u{0B44}' + | '\u{0B05}' .. '\u{0B0C}' + | '\u{0B0F}' .. '\u{0B10}' + | '\u{0B13}' .. '\u{0B28}' + | '\u{0B2A}' .. '\u{0B30}' + | '\u{0B32}' .. '\u{0B33}' + | '\u{0B35}' .. '\u{0B39}' + | '\u{0B3C}' .. '\u{0B44}' | '\u{0B47}' .. '\u{0B48}' | '\u{0B4B}' .. '\u{0B4D}' | '\u{0B55}' .. '\u{0B57}' - | '\u{0B62}' .. '\u{0B63}' + | '\u{0B5C}' .. '\u{0B5D}' + | '\u{0B5F}' .. '\u{0B63}' | '\u{0B66}' .. '\u{0B6F}' - | '\u{0B82}' + | '\u{0B71}' + | '\u{0B82}' .. '\u{0B83}' + | '\u{0B85}' .. '\u{0B8A}' + | '\u{0B8E}' .. '\u{0B90}' + | '\u{0B92}' .. '\u{0B95}' + | '\u{0B99}' .. '\u{0B9A}' + | '\u{0B9C}' + | '\u{0B9E}' .. '\u{0B9F}' + | '\u{0BA3}' .. '\u{0BA4}' + | '\u{0BA8}' .. '\u{0BAA}' + | '\u{0BAE}' .. '\u{0BB9}' | '\u{0BBE}' .. '\u{0BC2}' | '\u{0BC6}' .. '\u{0BC8}' | '\u{0BCA}' .. '\u{0BCD}' + | '\u{0BD0}' | '\u{0BD7}' | '\u{0BE6}' .. '\u{0BEF}' - | '\u{0C00}' .. '\u{0C04}' - | '\u{0C3C}' - | '\u{0C3E}' .. '\u{0C44}' + | '\u{0C00}' .. '\u{0C0C}' + | '\u{0C0E}' .. '\u{0C10}' + | '\u{0C12}' .. '\u{0C28}' + | '\u{0C2A}' .. '\u{0C39}' + | '\u{0C3C}' .. '\u{0C44}' | '\u{0C46}' .. '\u{0C48}' | '\u{0C4A}' .. '\u{0C4D}' | '\u{0C55}' .. '\u{0C56}' - | '\u{0C62}' .. '\u{0C63}' + | '\u{0C58}' .. '\u{0C5A}' + | '\u{0C5D}' + | '\u{0C60}' .. '\u{0C63}' | '\u{0C66}' .. '\u{0C6F}' - | '\u{0C81}' .. '\u{0C83}' - | '\u{0CBC}' - | '\u{0CBE}' .. '\u{0CC4}' + | '\u{0C80}' .. '\u{0C83}' + | '\u{0C85}' .. '\u{0C8C}' + | '\u{0C8E}' .. '\u{0C90}' + | '\u{0C92}' .. 
'\u{0CA8}' + | '\u{0CAA}' .. '\u{0CB3}' + | '\u{0CB5}' .. '\u{0CB9}' + | '\u{0CBC}' .. '\u{0CC4}' | '\u{0CC6}' .. '\u{0CC8}' | '\u{0CCA}' .. '\u{0CCD}' | '\u{0CD5}' .. '\u{0CD6}' - | '\u{0CE2}' .. '\u{0CE3}' + | '\u{0CDD}' .. '\u{0CDE}' + | '\u{0CE0}' .. '\u{0CE3}' | '\u{0CE6}' .. '\u{0CEF}' - | '\u{0CF3}' - | '\u{0D00}' .. '\u{0D03}' - | '\u{0D3B}' .. '\u{0D3C}' - | '\u{0D3E}' .. '\u{0D44}' + | '\u{0CF1}' .. '\u{0CF3}' + | '\u{0D00}' .. '\u{0D0C}' + | '\u{0D0E}' .. '\u{0D10}' + | '\u{0D12}' .. '\u{0D44}' | '\u{0D46}' .. '\u{0D48}' - | '\u{0D4A}' .. '\u{0D4D}' - | '\u{0D57}' - | '\u{0D62}' .. '\u{0D63}' + | '\u{0D4A}' .. '\u{0D4E}' + | '\u{0D54}' .. '\u{0D57}' + | '\u{0D5F}' .. '\u{0D63}' | '\u{0D66}' .. '\u{0D6F}' + | '\u{0D7A}' .. '\u{0D7F}' | '\u{0D81}' .. '\u{0D83}' + | '\u{0D85}' .. '\u{0D96}' + | '\u{0D9A}' .. '\u{0DB1}' + | '\u{0DB3}' .. '\u{0DBB}' + | '\u{0DBD}' + | '\u{0DC0}' .. '\u{0DC6}' | '\u{0DCA}' | '\u{0DCF}' .. '\u{0DD4}' | '\u{0DD6}' | '\u{0DD8}' .. '\u{0DDF}' | '\u{0DE6}' .. '\u{0DEF}' | '\u{0DF2}' .. '\u{0DF3}' - | '\u{0E31}' - | '\u{0E33}' .. '\u{0E3A}' - | '\u{0E47}' .. '\u{0E4E}' + | '\u{0E01}' .. '\u{0E3A}' + | '\u{0E40}' .. '\u{0E4E}' | '\u{0E50}' .. '\u{0E59}' - | '\u{0EB1}' - | '\u{0EB3}' .. '\u{0EBC}' + | '\u{0E81}' .. '\u{0E82}' + | '\u{0E84}' + | '\u{0E86}' .. '\u{0E8A}' + | '\u{0E8C}' .. '\u{0EA3}' + | '\u{0EA5}' + | '\u{0EA7}' .. '\u{0EBD}' + | '\u{0EC0}' .. '\u{0EC4}' + | '\u{0EC6}' | '\u{0EC8}' .. '\u{0ECE}' | '\u{0ED0}' .. '\u{0ED9}' + | '\u{0EDC}' .. '\u{0EDF}' + | '\u{0F00}' | '\u{0F18}' .. '\u{0F19}' | '\u{0F20}' .. '\u{0F29}' | '\u{0F35}' | '\u{0F37}' | '\u{0F39}' - | '\u{0F3E}' .. '\u{0F3F}' + | '\u{0F3E}' .. '\u{0F47}' + | '\u{0F49}' .. '\u{0F6C}' | '\u{0F71}' .. '\u{0F84}' - | '\u{0F86}' .. '\u{0F87}' - | '\u{0F8D}' .. '\u{0F97}' + | '\u{0F86}' .. '\u{0F97}' | '\u{0F99}' .. '\u{0FBC}' | '\u{0FC6}' - | '\u{102B}' .. '\u{103E}' - | '\u{1040}' .. '\u{1049}' - | '\u{1056}' .. '\u{1059}' - | '\u{105E}' .. 
'\u{1060}' - | '\u{1062}' .. '\u{1064}' - | '\u{1067}' .. '\u{106D}' - | '\u{1071}' .. '\u{1074}' - | '\u{1082}' .. '\u{108D}' - | '\u{108F}' .. '\u{109D}' + | '\u{1000}' .. '\u{1049}' + | '\u{1050}' .. '\u{109D}' + | '\u{10A0}' .. '\u{10C5}' + | '\u{10C7}' + | '\u{10CD}' + | '\u{10D0}' .. '\u{10FA}' + | '\u{10FC}' .. '\u{1248}' + | '\u{124A}' .. '\u{124D}' + | '\u{1250}' .. '\u{1256}' + | '\u{1258}' + | '\u{125A}' .. '\u{125D}' + | '\u{1260}' .. '\u{1288}' + | '\u{128A}' .. '\u{128D}' + | '\u{1290}' .. '\u{12B0}' + | '\u{12B2}' .. '\u{12B5}' + | '\u{12B8}' .. '\u{12BE}' + | '\u{12C0}' + | '\u{12C2}' .. '\u{12C5}' + | '\u{12C8}' .. '\u{12D6}' + | '\u{12D8}' .. '\u{1310}' + | '\u{1312}' .. '\u{1315}' + | '\u{1318}' .. '\u{135A}' | '\u{135D}' .. '\u{135F}' | '\u{1369}' .. '\u{1371}' - | '\u{1712}' .. '\u{1715}' - | '\u{1732}' .. '\u{1734}' - | '\u{1752}' .. '\u{1753}' + | '\u{1380}' .. '\u{138F}' + | '\u{13A0}' .. '\u{13F5}' + | '\u{13F8}' .. '\u{13FD}' + | '\u{1401}' .. '\u{166C}' + | '\u{166F}' .. '\u{167F}' + | '\u{1681}' .. '\u{169A}' + | '\u{16A0}' .. '\u{16EA}' + | '\u{16EE}' .. '\u{16F8}' + | '\u{1700}' .. '\u{1715}' + | '\u{171F}' .. '\u{1734}' + | '\u{1740}' .. '\u{1753}' + | '\u{1760}' .. '\u{176C}' + | '\u{176E}' .. '\u{1770}' | '\u{1772}' .. '\u{1773}' - | '\u{17B4}' .. '\u{17D3}' - | '\u{17DD}' + | '\u{1780}' .. '\u{17D3}' + | '\u{17D7}' + | '\u{17DC}' .. '\u{17DD}' | '\u{17E0}' .. '\u{17E9}' | '\u{180B}' .. '\u{180D}' | '\u{180F}' .. '\u{1819}' - | '\u{18A9}' + | '\u{1820}' .. '\u{1878}' + | '\u{1880}' .. '\u{18AA}' + | '\u{18B0}' .. '\u{18F5}' + | '\u{1900}' .. '\u{191E}' | '\u{1920}' .. '\u{192B}' | '\u{1930}' .. '\u{193B}' - | '\u{1946}' .. '\u{194F}' + | '\u{1946}' .. '\u{196D}' + | '\u{1970}' .. '\u{1974}' + | '\u{1980}' .. '\u{19AB}' + | '\u{19B0}' .. '\u{19C9}' | '\u{19D0}' .. '\u{19DA}' - | '\u{1A17}' .. '\u{1A1B}' - | '\u{1A55}' .. '\u{1A5E}' + | '\u{1A00}' .. '\u{1A1B}' + | '\u{1A20}' .. '\u{1A5E}' | '\u{1A60}' .. '\u{1A7C}' | '\u{1A7F}' .. 
'\u{1A89}' | '\u{1A90}' .. '\u{1A99}' + | '\u{1AA7}' | '\u{1AB0}' .. '\u{1ABD}' | '\u{1ABF}' .. '\u{1ACE}' - | '\u{1B00}' .. '\u{1B04}' - | '\u{1B34}' .. '\u{1B44}' + | '\u{1B00}' .. '\u{1B4C}' | '\u{1B50}' .. '\u{1B59}' | '\u{1B6B}' .. '\u{1B73}' - | '\u{1B80}' .. '\u{1B82}' - | '\u{1BA1}' .. '\u{1BAD}' - | '\u{1BB0}' .. '\u{1BB9}' - | '\u{1BE6}' .. '\u{1BF3}' - | '\u{1C24}' .. '\u{1C37}' + | '\u{1B80}' .. '\u{1BF3}' + | '\u{1C00}' .. '\u{1C37}' | '\u{1C40}' .. '\u{1C49}' - | '\u{1C50}' .. '\u{1C59}' + | '\u{1C4D}' .. '\u{1C7D}' + | '\u{1C80}' .. '\u{1C8A}' + | '\u{1C90}' .. '\u{1CBA}' + | '\u{1CBD}' .. '\u{1CBF}' | '\u{1CD0}' .. '\u{1CD2}' - | '\u{1CD4}' .. '\u{1CE8}' - | '\u{1CED}' - | '\u{1CF4}' - | '\u{1CF7}' .. '\u{1CF9}' - | '\u{1DC0}' .. '\u{1DFF}' + | '\u{1CD4}' .. '\u{1CFA}' + | '\u{1D00}' .. '\u{1F15}' + | '\u{1F18}' .. '\u{1F1D}' + | '\u{1F20}' .. '\u{1F45}' + | '\u{1F48}' .. '\u{1F4D}' + | '\u{1F50}' .. '\u{1F57}' + | '\u{1F59}' + | '\u{1F5B}' + | '\u{1F5D}' + | '\u{1F5F}' .. '\u{1F7D}' + | '\u{1F80}' .. '\u{1FB4}' + | '\u{1FB6}' .. '\u{1FBC}' + | '\u{1FBE}' + | '\u{1FC2}' .. '\u{1FC4}' + | '\u{1FC6}' .. '\u{1FCC}' + | '\u{1FD0}' .. '\u{1FD3}' + | '\u{1FD6}' .. '\u{1FDB}' + | '\u{1FE0}' .. '\u{1FEC}' + | '\u{1FF2}' .. '\u{1FF4}' + | '\u{1FF6}' .. '\u{1FFC}' | '\u{200C}' .. '\u{200D}' | '\u{203F}' .. '\u{2040}' | '\u{2054}' + | '\u{2071}' + | '\u{207F}' + | '\u{2090}' .. '\u{209C}' | '\u{20D0}' .. '\u{20DC}' | '\u{20E1}' | '\u{20E5}' .. '\u{20F0}' - | '\u{2CEF}' .. '\u{2CF1}' - | '\u{2D7F}' + | '\u{2102}' + | '\u{2107}' + | '\u{210A}' .. '\u{2113}' + | '\u{2115}' + | '\u{2118}' .. '\u{211D}' + | '\u{2124}' + | '\u{2126}' + | '\u{2128}' + | '\u{212A}' .. '\u{2139}' + | '\u{213C}' .. '\u{213F}' + | '\u{2145}' .. '\u{2149}' + | '\u{214E}' + | '\u{2160}' .. '\u{2188}' + | '\u{2C00}' .. '\u{2CE4}' + | '\u{2CEB}' .. '\u{2CF3}' + | '\u{2D00}' .. '\u{2D25}' + | '\u{2D27}' + | '\u{2D2D}' + | '\u{2D30}' .. '\u{2D67}' + | '\u{2D6F}' + | '\u{2D7F}' .. 
'\u{2D96}' + | '\u{2DA0}' .. '\u{2DA6}' + | '\u{2DA8}' .. '\u{2DAE}' + | '\u{2DB0}' .. '\u{2DB6}' + | '\u{2DB8}' .. '\u{2DBE}' + | '\u{2DC0}' .. '\u{2DC6}' + | '\u{2DC8}' .. '\u{2DCE}' + | '\u{2DD0}' .. '\u{2DD6}' + | '\u{2DD8}' .. '\u{2DDE}' | '\u{2DE0}' .. '\u{2DFF}' - | '\u{302A}' .. '\u{302F}' + | '\u{3005}' .. '\u{3007}' + | '\u{3021}' .. '\u{302F}' + | '\u{3031}' .. '\u{3035}' + | '\u{3038}' .. '\u{303C}' + | '\u{3041}' .. '\u{3096}' | '\u{3099}' .. '\u{309A}' - | '\u{30FB}' - | '\u{A620}' .. '\u{A629}' - | '\u{A66F}' + | '\u{309D}' .. '\u{309F}' + | '\u{30A1}' .. '\u{30FF}' + | '\u{3105}' .. '\u{312F}' + | '\u{3131}' .. '\u{318E}' + | '\u{31A0}' .. '\u{31BF}' + | '\u{31F0}' .. '\u{31FF}' + | '\u{3400}' .. '\u{4DBF}' + | '\u{4E00}' .. '\u{A48C}' + | '\u{A4D0}' .. '\u{A4FD}' + | '\u{A500}' .. '\u{A60C}' + | '\u{A610}' .. '\u{A62B}' + | '\u{A640}' .. '\u{A66F}' | '\u{A674}' .. '\u{A67D}' - | '\u{A69E}' .. '\u{A69F}' - | '\u{A6F0}' .. '\u{A6F1}' - | '\u{A802}' - | '\u{A806}' - | '\u{A80B}' - | '\u{A823}' .. '\u{A827}' + | '\u{A67F}' .. '\u{A6F1}' + | '\u{A717}' .. '\u{A71F}' + | '\u{A722}' .. '\u{A788}' + | '\u{A78B}' .. '\u{A7CD}' + | '\u{A7D0}' .. '\u{A7D1}' + | '\u{A7D3}' + | '\u{A7D5}' .. '\u{A7DC}' + | '\u{A7F2}' .. '\u{A827}' | '\u{A82C}' - | '\u{A880}' .. '\u{A881}' - | '\u{A8B4}' .. '\u{A8C5}' + | '\u{A840}' .. '\u{A873}' + | '\u{A880}' .. '\u{A8C5}' | '\u{A8D0}' .. '\u{A8D9}' - | '\u{A8E0}' .. '\u{A8F1}' - | '\u{A8FF}' .. '\u{A909}' - | '\u{A926}' .. '\u{A92D}' - | '\u{A947}' .. '\u{A953}' - | '\u{A980}' .. '\u{A983}' - | '\u{A9B3}' .. '\u{A9C0}' - | '\u{A9D0}' .. '\u{A9D9}' - | '\u{A9E5}' - | '\u{A9F0}' .. '\u{A9F9}' - | '\u{AA29}' .. '\u{AA36}' - | '\u{AA43}' - | '\u{AA4C}' .. '\u{AA4D}' + | '\u{A8E0}' .. '\u{A8F7}' + | '\u{A8FB}' + | '\u{A8FD}' .. '\u{A92D}' + | '\u{A930}' .. '\u{A953}' + | '\u{A960}' .. '\u{A97C}' + | '\u{A980}' .. '\u{A9C0}' + | '\u{A9CF}' .. '\u{A9D9}' + | '\u{A9E0}' .. '\u{A9FE}' + | '\u{AA00}' .. '\u{AA36}' + | '\u{AA40}' .. 
'\u{AA4D}' | '\u{AA50}' .. '\u{AA59}' - | '\u{AA7B}' .. '\u{AA7D}' - | '\u{AAB0}' - | '\u{AAB2}' .. '\u{AAB4}' - | '\u{AAB7}' .. '\u{AAB8}' - | '\u{AABE}' .. '\u{AABF}' - | '\u{AAC1}' - | '\u{AAEB}' .. '\u{AAEF}' - | '\u{AAF5}' .. '\u{AAF6}' - | '\u{ABE3}' .. '\u{ABEA}' + | '\u{AA60}' .. '\u{AA76}' + | '\u{AA7A}' .. '\u{AAC2}' + | '\u{AADB}' .. '\u{AADD}' + | '\u{AAE0}' .. '\u{AAEF}' + | '\u{AAF2}' .. '\u{AAF6}' + | '\u{AB01}' .. '\u{AB06}' + | '\u{AB09}' .. '\u{AB0E}' + | '\u{AB11}' .. '\u{AB16}' + | '\u{AB20}' .. '\u{AB26}' + | '\u{AB28}' .. '\u{AB2E}' + | '\u{AB30}' .. '\u{AB5A}' + | '\u{AB5C}' .. '\u{AB69}' + | '\u{AB70}' .. '\u{ABEA}' | '\u{ABEC}' .. '\u{ABED}' | '\u{ABF0}' .. '\u{ABF9}' - | '\u{FB1E}' + | '\u{AC00}' .. '\u{D7A3}' + | '\u{D7B0}' .. '\u{D7C6}' + | '\u{D7CB}' .. '\u{D7FB}' + | '\u{F900}' .. '\u{FA6D}' + | '\u{FA70}' .. '\u{FAD9}' + | '\u{FB00}' .. '\u{FB06}' + | '\u{FB13}' .. '\u{FB17}' + | '\u{FB1D}' .. '\u{FB28}' + | '\u{FB2A}' .. '\u{FB36}' + | '\u{FB38}' .. '\u{FB3C}' + | '\u{FB3E}' + | '\u{FB40}' .. '\u{FB41}' + | '\u{FB43}' .. '\u{FB44}' + | '\u{FB46}' .. '\u{FBB1}' + | '\u{FBD3}' .. '\u{FC5D}' + | '\u{FC64}' .. '\u{FD3D}' + | '\u{FD50}' .. '\u{FD8F}' + | '\u{FD92}' .. '\u{FDC7}' + | '\u{FDF0}' .. '\u{FDF9}' | '\u{FE00}' .. '\u{FE0F}' | '\u{FE20}' .. '\u{FE2F}' | '\u{FE33}' .. '\u{FE34}' | '\u{FE4D}' .. '\u{FE4F}' + | '\u{FE71}' + | '\u{FE73}' + | '\u{FE77}' + | '\u{FE79}' + | '\u{FE7B}' + | '\u{FE7D}' + | '\u{FE7F}' .. '\u{FEFC}' | '\u{FF10}' .. '\u{FF19}' + | '\u{FF21}' .. '\u{FF3A}' | '\u{FF3F}' - | '\u{FF65}' - | '\u{FF9E}' .. '\u{FF9F}' + | '\u{FF41}' .. '\u{FF5A}' + | '\u{FF65}' .. '\u{FFBE}' + | '\u{FFC2}' .. '\u{FFC7}' + | '\u{FFCA}' .. '\u{FFCF}' + | '\u{FFD2}' .. '\u{FFD7}' + | '\u{FFDA}' .. '\u{FFDC}' + | '\u{10000}' .. '\u{1000B}' + | '\u{1000D}' .. '\u{10026}' + | '\u{10028}' .. '\u{1003A}' + | '\u{1003C}' .. '\u{1003D}' + | '\u{1003F}' .. '\u{1004D}' + | '\u{10050}' .. '\u{1005D}' + | '\u{10080}' .. 
'\u{100FA}' + | '\u{10140}' .. '\u{10174}' | '\u{101FD}' + | '\u{10280}' .. '\u{1029C}' + | '\u{102A0}' .. '\u{102D0}' | '\u{102E0}' - | '\u{10376}' .. '\u{1037A}' + | '\u{10300}' .. '\u{1031F}' + | '\u{1032D}' .. '\u{1034A}' + | '\u{10350}' .. '\u{1037A}' + | '\u{10380}' .. '\u{1039D}' + | '\u{103A0}' .. '\u{103C3}' + | '\u{103C8}' .. '\u{103CF}' + | '\u{103D1}' .. '\u{103D5}' + | '\u{10400}' .. '\u{1049D}' | '\u{104A0}' .. '\u{104A9}' - | '\u{10A01}' .. '\u{10A03}' + | '\u{104B0}' .. '\u{104D3}' + | '\u{104D8}' .. '\u{104FB}' + | '\u{10500}' .. '\u{10527}' + | '\u{10530}' .. '\u{10563}' + | '\u{10570}' .. '\u{1057A}' + | '\u{1057C}' .. '\u{1058A}' + | '\u{1058C}' .. '\u{10592}' + | '\u{10594}' .. '\u{10595}' + | '\u{10597}' .. '\u{105A1}' + | '\u{105A3}' .. '\u{105B1}' + | '\u{105B3}' .. '\u{105B9}' + | '\u{105BB}' .. '\u{105BC}' + | '\u{105C0}' .. '\u{105F3}' + | '\u{10600}' .. '\u{10736}' + | '\u{10740}' .. '\u{10755}' + | '\u{10760}' .. '\u{10767}' + | '\u{10780}' .. '\u{10785}' + | '\u{10787}' .. '\u{107B0}' + | '\u{107B2}' .. '\u{107BA}' + | '\u{10800}' .. '\u{10805}' + | '\u{10808}' + | '\u{1080A}' .. '\u{10835}' + | '\u{10837}' .. '\u{10838}' + | '\u{1083C}' + | '\u{1083F}' .. '\u{10855}' + | '\u{10860}' .. '\u{10876}' + | '\u{10880}' .. '\u{1089E}' + | '\u{108E0}' .. '\u{108F2}' + | '\u{108F4}' .. '\u{108F5}' + | '\u{10900}' .. '\u{10915}' + | '\u{10920}' .. '\u{10939}' + | '\u{10980}' .. '\u{109B7}' + | '\u{109BE}' .. '\u{109BF}' + | '\u{10A00}' .. '\u{10A03}' | '\u{10A05}' .. '\u{10A06}' - | '\u{10A0C}' .. '\u{10A0F}' + | '\u{10A0C}' .. '\u{10A13}' + | '\u{10A15}' .. '\u{10A17}' + | '\u{10A19}' .. '\u{10A35}' | '\u{10A38}' .. '\u{10A3A}' | '\u{10A3F}' - | '\u{10AE5}' .. '\u{10AE6}' - | '\u{10D24}' .. '\u{10D27}' + | '\u{10A60}' .. '\u{10A7C}' + | '\u{10A80}' .. '\u{10A9C}' + | '\u{10AC0}' .. '\u{10AC7}' + | '\u{10AC9}' .. '\u{10AE6}' + | '\u{10B00}' .. '\u{10B35}' + | '\u{10B40}' .. '\u{10B55}' + | '\u{10B60}' .. '\u{10B72}' + | '\u{10B80}' .. 
'\u{10B91}' + | '\u{10C00}' .. '\u{10C48}' + | '\u{10C80}' .. '\u{10CB2}' + | '\u{10CC0}' .. '\u{10CF2}' + | '\u{10D00}' .. '\u{10D27}' | '\u{10D30}' .. '\u{10D39}' - | '\u{10D40}' .. '\u{10D49}' + | '\u{10D40}' .. '\u{10D65}' | '\u{10D69}' .. '\u{10D6D}' + | '\u{10D6F}' .. '\u{10D85}' + | '\u{10E80}' .. '\u{10EA9}' | '\u{10EAB}' .. '\u{10EAC}' - | '\u{10EFC}' .. '\u{10EFF}' - | '\u{10F46}' .. '\u{10F50}' - | '\u{10F82}' .. '\u{10F85}' - | '\u{11000}' .. '\u{11002}' - | '\u{11038}' .. '\u{11046}' - | '\u{11066}' .. '\u{11070}' - | '\u{11073}' .. '\u{11074}' - | '\u{1107F}' .. '\u{11082}' - | '\u{110B0}' .. '\u{110BA}' + | '\u{10EB0}' .. '\u{10EB1}' + | '\u{10EC2}' .. '\u{10EC4}' + | '\u{10EFC}' .. '\u{10F1C}' + | '\u{10F27}' + | '\u{10F30}' .. '\u{10F50}' + | '\u{10F70}' .. '\u{10F85}' + | '\u{10FB0}' .. '\u{10FC4}' + | '\u{10FE0}' .. '\u{10FF6}' + | '\u{11000}' .. '\u{11046}' + | '\u{11066}' .. '\u{11075}' + | '\u{1107F}' .. '\u{110BA}' | '\u{110C2}' + | '\u{110D0}' .. '\u{110E8}' | '\u{110F0}' .. '\u{110F9}' - | '\u{11100}' .. '\u{11102}' - | '\u{11127}' .. '\u{11134}' + | '\u{11100}' .. '\u{11134}' | '\u{11136}' .. '\u{1113F}' - | '\u{11145}' .. '\u{11146}' - | '\u{11173}' - | '\u{11180}' .. '\u{11182}' - | '\u{111B3}' .. '\u{111C0}' + | '\u{11144}' .. '\u{11147}' + | '\u{11150}' .. '\u{11173}' + | '\u{11176}' + | '\u{11180}' .. '\u{111C4}' | '\u{111C9}' .. '\u{111CC}' - | '\u{111CE}' .. '\u{111D9}' - | '\u{1122C}' .. '\u{11237}' - | '\u{1123E}' - | '\u{11241}' - | '\u{112DF}' .. '\u{112EA}' + | '\u{111CE}' .. '\u{111DA}' + | '\u{111DC}' + | '\u{11200}' .. '\u{11211}' + | '\u{11213}' .. '\u{11237}' + | '\u{1123E}' .. '\u{11241}' + | '\u{11280}' .. '\u{11286}' + | '\u{11288}' + | '\u{1128A}' .. '\u{1128D}' + | '\u{1128F}' .. '\u{1129D}' + | '\u{1129F}' .. '\u{112A8}' + | '\u{112B0}' .. '\u{112EA}' | '\u{112F0}' .. '\u{112F9}' | '\u{11300}' .. '\u{11303}' - | '\u{1133B}' .. '\u{1133C}' - | '\u{1133E}' .. '\u{11344}' + | '\u{11305}' .. 
'\u{1130C}' + | '\u{1130F}' .. '\u{11310}' + | '\u{11313}' .. '\u{11328}' + | '\u{1132A}' .. '\u{11330}' + | '\u{11332}' .. '\u{11333}' + | '\u{11335}' .. '\u{11339}' + | '\u{1133B}' .. '\u{11344}' | '\u{11347}' .. '\u{11348}' | '\u{1134B}' .. '\u{1134D}' + | '\u{11350}' | '\u{11357}' - | '\u{11362}' .. '\u{11363}' + | '\u{1135D}' .. '\u{11363}' | '\u{11366}' .. '\u{1136C}' | '\u{11370}' .. '\u{11374}' - | '\u{113B8}' .. '\u{113C0}' + | '\u{11380}' .. '\u{11389}' + | '\u{1138B}' + | '\u{1138E}' + | '\u{11390}' .. '\u{113B5}' + | '\u{113B7}' .. '\u{113C0}' | '\u{113C2}' | '\u{113C5}' | '\u{113C7}' .. '\u{113CA}' - | '\u{113CC}' .. '\u{113D0}' - | '\u{113D2}' + | '\u{113CC}' .. '\u{113D3}' | '\u{113E1}' .. '\u{113E2}' - | '\u{11435}' .. '\u{11446}' + | '\u{11400}' .. '\u{1144A}' | '\u{11450}' .. '\u{11459}' - | '\u{1145E}' - | '\u{114B0}' .. '\u{114C3}' + | '\u{1145E}' .. '\u{11461}' + | '\u{11480}' .. '\u{114C5}' + | '\u{114C7}' | '\u{114D0}' .. '\u{114D9}' - | '\u{115AF}' .. '\u{115B5}' + | '\u{11580}' .. '\u{115B5}' | '\u{115B8}' .. '\u{115C0}' - | '\u{115DC}' .. '\u{115DD}' - | '\u{11630}' .. '\u{11640}' + | '\u{115D8}' .. '\u{115DD}' + | '\u{11600}' .. '\u{11640}' + | '\u{11644}' | '\u{11650}' .. '\u{11659}' - | '\u{116AB}' .. '\u{116B7}' + | '\u{11680}' .. '\u{116B8}' | '\u{116C0}' .. '\u{116C9}' | '\u{116D0}' .. '\u{116E3}' + | '\u{11700}' .. '\u{1171A}' | '\u{1171D}' .. '\u{1172B}' | '\u{11730}' .. '\u{11739}' - | '\u{1182C}' .. '\u{1183A}' - | '\u{118E0}' .. '\u{118E9}' - | '\u{11930}' .. '\u{11935}' + | '\u{11740}' .. '\u{11746}' + | '\u{11800}' .. '\u{1183A}' + | '\u{118A0}' .. '\u{118E9}' + | '\u{118FF}' .. '\u{11906}' + | '\u{11909}' + | '\u{1190C}' .. '\u{11913}' + | '\u{11915}' .. '\u{11916}' + | '\u{11918}' .. '\u{11935}' | '\u{11937}' .. '\u{11938}' - | '\u{1193B}' .. '\u{1193E}' - | '\u{11940}' - | '\u{11942}' .. '\u{11943}' + | '\u{1193B}' .. '\u{11943}' | '\u{11950}' .. '\u{11959}' - | '\u{119D1}' .. '\u{119D7}' - | '\u{119DA}' .. 
'\u{119E0}' - | '\u{119E4}' - | '\u{11A01}' .. '\u{11A0A}' - | '\u{11A33}' .. '\u{11A39}' - | '\u{11A3B}' .. '\u{11A3E}' + | '\u{119A0}' .. '\u{119A7}' + | '\u{119AA}' .. '\u{119D7}' + | '\u{119DA}' .. '\u{119E1}' + | '\u{119E3}' .. '\u{119E4}' + | '\u{11A00}' .. '\u{11A3E}' | '\u{11A47}' - | '\u{11A51}' .. '\u{11A5B}' - | '\u{11A8A}' .. '\u{11A99}' + | '\u{11A50}' .. '\u{11A99}' + | '\u{11A9D}' + | '\u{11AB0}' .. '\u{11AF8}' + | '\u{11BC0}' .. '\u{11BE0}' | '\u{11BF0}' .. '\u{11BF9}' - | '\u{11C2F}' .. '\u{11C36}' - | '\u{11C38}' .. '\u{11C3F}' + | '\u{11C00}' .. '\u{11C08}' + | '\u{11C0A}' .. '\u{11C36}' + | '\u{11C38}' .. '\u{11C40}' | '\u{11C50}' .. '\u{11C59}' + | '\u{11C72}' .. '\u{11C8F}' | '\u{11C92}' .. '\u{11CA7}' | '\u{11CA9}' .. '\u{11CB6}' - | '\u{11D31}' .. '\u{11D36}' + | '\u{11D00}' .. '\u{11D06}' + | '\u{11D08}' .. '\u{11D09}' + | '\u{11D0B}' .. '\u{11D36}' | '\u{11D3A}' | '\u{11D3C}' .. '\u{11D3D}' - | '\u{11D3F}' .. '\u{11D45}' - | '\u{11D47}' + | '\u{11D3F}' .. '\u{11D47}' | '\u{11D50}' .. '\u{11D59}' - | '\u{11D8A}' .. '\u{11D8E}' + | '\u{11D60}' .. '\u{11D65}' + | '\u{11D67}' .. '\u{11D68}' + | '\u{11D6A}' .. '\u{11D8E}' | '\u{11D90}' .. '\u{11D91}' - | '\u{11D93}' .. '\u{11D97}' + | '\u{11D93}' .. '\u{11D98}' | '\u{11DA0}' .. '\u{11DA9}' - | '\u{11EF3}' .. '\u{11EF6}' - | '\u{11F00}' .. '\u{11F01}' - | '\u{11F03}' - | '\u{11F34}' .. '\u{11F3A}' + | '\u{11EE0}' .. '\u{11EF6}' + | '\u{11F00}' .. '\u{11F10}' + | '\u{11F12}' .. '\u{11F3A}' | '\u{11F3E}' .. '\u{11F42}' | '\u{11F50}' .. '\u{11F5A}' - | '\u{13440}' - | '\u{13447}' .. '\u{13455}' - | '\u{1611E}' .. '\u{16139}' + | '\u{11FB0}' + | '\u{12000}' .. '\u{12399}' + | '\u{12400}' .. '\u{1246E}' + | '\u{12480}' .. '\u{12543}' + | '\u{12F90}' .. '\u{12FF0}' + | '\u{13000}' .. '\u{1342F}' + | '\u{13440}' .. '\u{13455}' + | '\u{13460}' .. '\u{143FA}' + | '\u{14400}' .. '\u{14646}' + | '\u{16100}' .. '\u{16139}' + | '\u{16800}' .. '\u{16A38}' + | '\u{16A40}' .. '\u{16A5E}' | '\u{16A60}' .. 
'\u{16A69}' + | '\u{16A70}' .. '\u{16ABE}' | '\u{16AC0}' .. '\u{16AC9}' + | '\u{16AD0}' .. '\u{16AED}' | '\u{16AF0}' .. '\u{16AF4}' - | '\u{16B30}' .. '\u{16B36}' + | '\u{16B00}' .. '\u{16B36}' + | '\u{16B40}' .. '\u{16B43}' | '\u{16B50}' .. '\u{16B59}' + | '\u{16B63}' .. '\u{16B77}' + | '\u{16B7D}' .. '\u{16B8F}' + | '\u{16D40}' .. '\u{16D6C}' | '\u{16D70}' .. '\u{16D79}' - | '\u{16F4F}' - | '\u{16F51}' .. '\u{16F87}' - | '\u{16F8F}' .. '\u{16F92}' - | '\u{16FE4}' + | '\u{16E40}' .. '\u{16E7F}' + | '\u{16F00}' .. '\u{16F4A}' + | '\u{16F4F}' .. '\u{16F87}' + | '\u{16F8F}' .. '\u{16F9F}' + | '\u{16FE0}' .. '\u{16FE1}' + | '\u{16FE3}' .. '\u{16FE4}' | '\u{16FF0}' .. '\u{16FF1}' + | '\u{17000}' .. '\u{187F7}' + | '\u{18800}' .. '\u{18CD5}' + | '\u{18CFF}' .. '\u{18D08}' + | '\u{1AFF0}' .. '\u{1AFF3}' + | '\u{1AFF5}' .. '\u{1AFFB}' + | '\u{1AFFD}' .. '\u{1AFFE}' + | '\u{1B000}' .. '\u{1B122}' + | '\u{1B132}' + | '\u{1B150}' .. '\u{1B152}' + | '\u{1B155}' + | '\u{1B164}' .. '\u{1B167}' + | '\u{1B170}' .. '\u{1B2FB}' + | '\u{1BC00}' .. '\u{1BC6A}' + | '\u{1BC70}' .. '\u{1BC7C}' + | '\u{1BC80}' .. '\u{1BC88}' + | '\u{1BC90}' .. '\u{1BC99}' | '\u{1BC9D}' .. '\u{1BC9E}' | '\u{1CCF0}' .. '\u{1CCF9}' | '\u{1CF00}' .. '\u{1CF2D}' @@ -945,6 +1271,36 @@ fragment ID_CONTINUE // for Python 3.14.2 | '\u{1D185}' .. '\u{1D18B}' | '\u{1D1AA}' .. '\u{1D1AD}' | '\u{1D242}' .. '\u{1D244}' + | '\u{1D400}' .. '\u{1D454}' + | '\u{1D456}' .. '\u{1D49C}' + | '\u{1D49E}' .. '\u{1D49F}' + | '\u{1D4A2}' + | '\u{1D4A5}' .. '\u{1D4A6}' + | '\u{1D4A9}' .. '\u{1D4AC}' + | '\u{1D4AE}' .. '\u{1D4B9}' + | '\u{1D4BB}' + | '\u{1D4BD}' .. '\u{1D4C3}' + | '\u{1D4C5}' .. '\u{1D505}' + | '\u{1D507}' .. '\u{1D50A}' + | '\u{1D50D}' .. '\u{1D514}' + | '\u{1D516}' .. '\u{1D51C}' + | '\u{1D51E}' .. '\u{1D539}' + | '\u{1D53B}' .. '\u{1D53E}' + | '\u{1D540}' .. '\u{1D544}' + | '\u{1D546}' + | '\u{1D54A}' .. '\u{1D550}' + | '\u{1D552}' .. '\u{1D6A5}' + | '\u{1D6A8}' .. '\u{1D6C0}' + | '\u{1D6C2}' .. 
'\u{1D6DA}' + | '\u{1D6DC}' .. '\u{1D6FA}' + | '\u{1D6FC}' .. '\u{1D714}' + | '\u{1D716}' .. '\u{1D734}' + | '\u{1D736}' .. '\u{1D74E}' + | '\u{1D750}' .. '\u{1D76E}' + | '\u{1D770}' .. '\u{1D788}' + | '\u{1D78A}' .. '\u{1D7A8}' + | '\u{1D7AA}' .. '\u{1D7C2}' + | '\u{1D7C4}' .. '\u{1D7CB}' | '\u{1D7CE}' .. '\u{1D7FF}' | '\u{1DA00}' .. '\u{1DA36}' | '\u{1DA3B}' .. '\u{1DA6C}' @@ -952,23 +1308,74 @@ fragment ID_CONTINUE // for Python 3.14.2 | '\u{1DA84}' | '\u{1DA9B}' .. '\u{1DA9F}' | '\u{1DAA1}' .. '\u{1DAAF}' + | '\u{1DF00}' .. '\u{1DF1E}' + | '\u{1DF25}' .. '\u{1DF2A}' | '\u{1E000}' .. '\u{1E006}' | '\u{1E008}' .. '\u{1E018}' | '\u{1E01B}' .. '\u{1E021}' | '\u{1E023}' .. '\u{1E024}' | '\u{1E026}' .. '\u{1E02A}' + | '\u{1E030}' .. '\u{1E06D}' | '\u{1E08F}' - | '\u{1E130}' .. '\u{1E136}' + | '\u{1E100}' .. '\u{1E12C}' + | '\u{1E130}' .. '\u{1E13D}' | '\u{1E140}' .. '\u{1E149}' - | '\u{1E2AE}' - | '\u{1E2EC}' .. '\u{1E2F9}' - | '\u{1E4EC}' .. '\u{1E4F9}' - | '\u{1E5EE}' .. '\u{1E5EF}' - | '\u{1E5F1}' .. '\u{1E5FA}' + | '\u{1E14E}' + | '\u{1E290}' .. '\u{1E2AE}' + | '\u{1E2C0}' .. '\u{1E2F9}' + | '\u{1E4D0}' .. '\u{1E4F9}' + | '\u{1E5D0}' .. '\u{1E5FA}' + | '\u{1E7E0}' .. '\u{1E7E6}' + | '\u{1E7E8}' .. '\u{1E7EB}' + | '\u{1E7ED}' .. '\u{1E7EE}' + | '\u{1E7F0}' .. '\u{1E7FE}' + | '\u{1E800}' .. '\u{1E8C4}' | '\u{1E8D0}' .. '\u{1E8D6}' - | '\u{1E944}' .. '\u{1E94A}' + | '\u{1E900}' .. '\u{1E94B}' | '\u{1E950}' .. '\u{1E959}' + | '\u{1EE00}' .. '\u{1EE03}' + | '\u{1EE05}' .. '\u{1EE1F}' + | '\u{1EE21}' .. '\u{1EE22}' + | '\u{1EE24}' + | '\u{1EE27}' + | '\u{1EE29}' .. '\u{1EE32}' + | '\u{1EE34}' .. '\u{1EE37}' + | '\u{1EE39}' + | '\u{1EE3B}' + | '\u{1EE42}' + | '\u{1EE47}' + | '\u{1EE49}' + | '\u{1EE4B}' + | '\u{1EE4D}' .. '\u{1EE4F}' + | '\u{1EE51}' .. '\u{1EE52}' + | '\u{1EE54}' + | '\u{1EE57}' + | '\u{1EE59}' + | '\u{1EE5B}' + | '\u{1EE5D}' + | '\u{1EE5F}' + | '\u{1EE61}' .. '\u{1EE62}' + | '\u{1EE64}' + | '\u{1EE67}' .. '\u{1EE6A}' + | '\u{1EE6C}' .. 
'\u{1EE72}' + | '\u{1EE74}' .. '\u{1EE77}' + | '\u{1EE79}' .. '\u{1EE7C}' + | '\u{1EE7E}' + | '\u{1EE80}' .. '\u{1EE89}' + | '\u{1EE8B}' .. '\u{1EE9B}' + | '\u{1EEA1}' .. '\u{1EEA3}' + | '\u{1EEA5}' .. '\u{1EEA9}' + | '\u{1EEAB}' .. '\u{1EEBB}' | '\u{1FBF0}' .. '\u{1FBF9}' + | '\u{20000}' .. '\u{2A6DF}' + | '\u{2A700}' .. '\u{2B739}' + | '\u{2B740}' .. '\u{2B81D}' + | '\u{2B820}' .. '\u{2CEA1}' + | '\u{2CEB0}' .. '\u{2EBE0}' + | '\u{2EBF0}' .. '\u{2EE5D}' + | '\u{2F800}' .. '\u{2FA1D}' + | '\u{30000}' .. '\u{3134A}' + | '\u{31350}' .. '\u{323AF}' | '\u{E0100}' .. '\u{E01EF}' ; From d00b42114e3b0f31fbd0bb6439285c0d6eaa8fc5 Mon Sep 17 00:00:00 2001 From: Robert Einhorn Date: Sat, 27 Dec 2025 21:59:41 +0100 Subject: [PATCH 6/6] Update PythonLexer.g4 --- python/python3_14/PythonLexer.g4 | 833 ++++++++----------------------- 1 file changed, 213 insertions(+), 620 deletions(-) diff --git a/python/python3_14/PythonLexer.g4 b/python/python3_14/PythonLexer.g4 index 488dd0a919..a6ed067c3c 100644 --- a/python/python3_14/PythonLexer.g4 +++ b/python/python3_14/PythonLexer.g4 @@ -578,689 +578,363 @@ fragment IMAG_NUMBER : (FLOAT_NUMBER | DIGIT_PART) ('j' | 'J'); fragment ID_CONTINUE // for Python 3.14.2 : ID_START | '\u{0030}' .. '\u{0039}' - | '\u{0041}' .. '\u{005A}' - | '\u{005F}' - | '\u{0061}' .. '\u{007A}' - | '\u{00AA}' - | '\u{00B5}' | '\u{00B7}' - | '\u{00BA}' - | '\u{00C0}' .. '\u{00D6}' - | '\u{00D8}' .. '\u{00F6}' - | '\u{00F8}' .. '\u{02C1}' - | '\u{02C6}' .. '\u{02D1}' - | '\u{02E0}' .. '\u{02E4}' - | '\u{02EC}' - | '\u{02EE}' - | '\u{0300}' .. '\u{0374}' - | '\u{0376}' .. '\u{0377}' - | '\u{037B}' .. '\u{037D}' - | '\u{037F}' - | '\u{0386}' .. '\u{038A}' - | '\u{038C}' - | '\u{038E}' .. '\u{03A1}' - | '\u{03A3}' .. '\u{03F5}' - | '\u{03F7}' .. '\u{0481}' + | '\u{0300}' .. '\u{036F}' + | '\u{0387}' | '\u{0483}' .. '\u{0487}' - | '\u{048A}' .. '\u{052F}' - | '\u{0531}' .. '\u{0556}' - | '\u{0559}' - | '\u{0560}' .. '\u{0588}' | '\u{0591}' .. 
'\u{05BD}' | '\u{05BF}' | '\u{05C1}' .. '\u{05C2}' | '\u{05C4}' .. '\u{05C5}' | '\u{05C7}' - | '\u{05D0}' .. '\u{05EA}' - | '\u{05EF}' .. '\u{05F2}' | '\u{0610}' .. '\u{061A}' - | '\u{0620}' .. '\u{0669}' - | '\u{066E}' .. '\u{06D3}' - | '\u{06D5}' .. '\u{06DC}' - | '\u{06DF}' .. '\u{06E8}' - | '\u{06EA}' .. '\u{06FC}' - | '\u{06FF}' - | '\u{0710}' .. '\u{074A}' - | '\u{074D}' .. '\u{07B1}' - | '\u{07C0}' .. '\u{07F5}' - | '\u{07FA}' + | '\u{064B}' .. '\u{0669}' + | '\u{0670}' + | '\u{06D6}' .. '\u{06DC}' + | '\u{06DF}' .. '\u{06E4}' + | '\u{06E7}' .. '\u{06E8}' + | '\u{06EA}' .. '\u{06ED}' + | '\u{06F0}' .. '\u{06F9}' + | '\u{0711}' + | '\u{0730}' .. '\u{074A}' + | '\u{07A6}' .. '\u{07B0}' + | '\u{07C0}' .. '\u{07C9}' + | '\u{07EB}' .. '\u{07F3}' | '\u{07FD}' - | '\u{0800}' .. '\u{082D}' - | '\u{0840}' .. '\u{085B}' - | '\u{0860}' .. '\u{086A}' - | '\u{0870}' .. '\u{0887}' - | '\u{0889}' .. '\u{088E}' - | '\u{0897}' .. '\u{08E1}' - | '\u{08E3}' .. '\u{0963}' + | '\u{0816}' .. '\u{0819}' + | '\u{081B}' .. '\u{0823}' + | '\u{0825}' .. '\u{0827}' + | '\u{0829}' .. '\u{082D}' + | '\u{0859}' .. '\u{085B}' + | '\u{0897}' .. '\u{089F}' + | '\u{08CA}' .. '\u{08E1}' + | '\u{08E3}' .. '\u{0903}' + | '\u{093A}' .. '\u{093C}' + | '\u{093E}' .. '\u{094F}' + | '\u{0951}' .. '\u{0957}' + | '\u{0962}' .. '\u{0963}' | '\u{0966}' .. '\u{096F}' - | '\u{0971}' .. '\u{0983}' - | '\u{0985}' .. '\u{098C}' - | '\u{098F}' .. '\u{0990}' - | '\u{0993}' .. '\u{09A8}' - | '\u{09AA}' .. '\u{09B0}' - | '\u{09B2}' - | '\u{09B6}' .. '\u{09B9}' - | '\u{09BC}' .. '\u{09C4}' + | '\u{0981}' .. '\u{0983}' + | '\u{09BC}' + | '\u{09BE}' .. '\u{09C4}' | '\u{09C7}' .. '\u{09C8}' - | '\u{09CB}' .. '\u{09CE}' + | '\u{09CB}' .. '\u{09CD}' | '\u{09D7}' - | '\u{09DC}' .. '\u{09DD}' - | '\u{09DF}' .. '\u{09E3}' - | '\u{09E6}' .. '\u{09F1}' - | '\u{09FC}' + | '\u{09E2}' .. '\u{09E3}' + | '\u{09E6}' .. '\u{09EF}' | '\u{09FE}' | '\u{0A01}' .. '\u{0A03}' - | '\u{0A05}' .. '\u{0A0A}' - | '\u{0A0F}' .. 
'\u{0A10}' - | '\u{0A13}' .. '\u{0A28}' - | '\u{0A2A}' .. '\u{0A30}' - | '\u{0A32}' .. '\u{0A33}' - | '\u{0A35}' .. '\u{0A36}' - | '\u{0A38}' .. '\u{0A39}' | '\u{0A3C}' | '\u{0A3E}' .. '\u{0A42}' | '\u{0A47}' .. '\u{0A48}' | '\u{0A4B}' .. '\u{0A4D}' | '\u{0A51}' - | '\u{0A59}' .. '\u{0A5C}' - | '\u{0A5E}' - | '\u{0A66}' .. '\u{0A75}' + | '\u{0A66}' .. '\u{0A71}' + | '\u{0A75}' | '\u{0A81}' .. '\u{0A83}' - | '\u{0A85}' .. '\u{0A8D}' - | '\u{0A8F}' .. '\u{0A91}' - | '\u{0A93}' .. '\u{0AA8}' - | '\u{0AAA}' .. '\u{0AB0}' - | '\u{0AB2}' .. '\u{0AB3}' - | '\u{0AB5}' .. '\u{0AB9}' - | '\u{0ABC}' .. '\u{0AC5}' + | '\u{0ABC}' + | '\u{0ABE}' .. '\u{0AC5}' | '\u{0AC7}' .. '\u{0AC9}' | '\u{0ACB}' .. '\u{0ACD}' - | '\u{0AD0}' - | '\u{0AE0}' .. '\u{0AE3}' + | '\u{0AE2}' .. '\u{0AE3}' | '\u{0AE6}' .. '\u{0AEF}' - | '\u{0AF9}' .. '\u{0AFF}' + | '\u{0AFA}' .. '\u{0AFF}' | '\u{0B01}' .. '\u{0B03}' - | '\u{0B05}' .. '\u{0B0C}' - | '\u{0B0F}' .. '\u{0B10}' - | '\u{0B13}' .. '\u{0B28}' - | '\u{0B2A}' .. '\u{0B30}' - | '\u{0B32}' .. '\u{0B33}' - | '\u{0B35}' .. '\u{0B39}' - | '\u{0B3C}' .. '\u{0B44}' + | '\u{0B3C}' + | '\u{0B3E}' .. '\u{0B44}' | '\u{0B47}' .. '\u{0B48}' | '\u{0B4B}' .. '\u{0B4D}' | '\u{0B55}' .. '\u{0B57}' - | '\u{0B5C}' .. '\u{0B5D}' - | '\u{0B5F}' .. '\u{0B63}' + | '\u{0B62}' .. '\u{0B63}' | '\u{0B66}' .. '\u{0B6F}' - | '\u{0B71}' - | '\u{0B82}' .. '\u{0B83}' - | '\u{0B85}' .. '\u{0B8A}' - | '\u{0B8E}' .. '\u{0B90}' - | '\u{0B92}' .. '\u{0B95}' - | '\u{0B99}' .. '\u{0B9A}' - | '\u{0B9C}' - | '\u{0B9E}' .. '\u{0B9F}' - | '\u{0BA3}' .. '\u{0BA4}' - | '\u{0BA8}' .. '\u{0BAA}' - | '\u{0BAE}' .. '\u{0BB9}' + | '\u{0B82}' | '\u{0BBE}' .. '\u{0BC2}' | '\u{0BC6}' .. '\u{0BC8}' | '\u{0BCA}' .. '\u{0BCD}' - | '\u{0BD0}' | '\u{0BD7}' | '\u{0BE6}' .. '\u{0BEF}' - | '\u{0C00}' .. '\u{0C0C}' - | '\u{0C0E}' .. '\u{0C10}' - | '\u{0C12}' .. '\u{0C28}' - | '\u{0C2A}' .. '\u{0C39}' - | '\u{0C3C}' .. '\u{0C44}' + | '\u{0C00}' .. '\u{0C04}' + | '\u{0C3C}' + | '\u{0C3E}' .. 
'\u{0C44}' | '\u{0C46}' .. '\u{0C48}' | '\u{0C4A}' .. '\u{0C4D}' | '\u{0C55}' .. '\u{0C56}' - | '\u{0C58}' .. '\u{0C5A}' - | '\u{0C5D}' - | '\u{0C60}' .. '\u{0C63}' + | '\u{0C62}' .. '\u{0C63}' | '\u{0C66}' .. '\u{0C6F}' - | '\u{0C80}' .. '\u{0C83}' - | '\u{0C85}' .. '\u{0C8C}' - | '\u{0C8E}' .. '\u{0C90}' - | '\u{0C92}' .. '\u{0CA8}' - | '\u{0CAA}' .. '\u{0CB3}' - | '\u{0CB5}' .. '\u{0CB9}' - | '\u{0CBC}' .. '\u{0CC4}' + | '\u{0C81}' .. '\u{0C83}' + | '\u{0CBC}' + | '\u{0CBE}' .. '\u{0CC4}' | '\u{0CC6}' .. '\u{0CC8}' | '\u{0CCA}' .. '\u{0CCD}' | '\u{0CD5}' .. '\u{0CD6}' - | '\u{0CDD}' .. '\u{0CDE}' - | '\u{0CE0}' .. '\u{0CE3}' + | '\u{0CE2}' .. '\u{0CE3}' | '\u{0CE6}' .. '\u{0CEF}' - | '\u{0CF1}' .. '\u{0CF3}' - | '\u{0D00}' .. '\u{0D0C}' - | '\u{0D0E}' .. '\u{0D10}' - | '\u{0D12}' .. '\u{0D44}' + | '\u{0CF3}' + | '\u{0D00}' .. '\u{0D03}' + | '\u{0D3B}' .. '\u{0D3C}' + | '\u{0D3E}' .. '\u{0D44}' | '\u{0D46}' .. '\u{0D48}' - | '\u{0D4A}' .. '\u{0D4E}' - | '\u{0D54}' .. '\u{0D57}' - | '\u{0D5F}' .. '\u{0D63}' + | '\u{0D4A}' .. '\u{0D4D}' + | '\u{0D57}' + | '\u{0D62}' .. '\u{0D63}' | '\u{0D66}' .. '\u{0D6F}' - | '\u{0D7A}' .. '\u{0D7F}' | '\u{0D81}' .. '\u{0D83}' - | '\u{0D85}' .. '\u{0D96}' - | '\u{0D9A}' .. '\u{0DB1}' - | '\u{0DB3}' .. '\u{0DBB}' - | '\u{0DBD}' - | '\u{0DC0}' .. '\u{0DC6}' | '\u{0DCA}' | '\u{0DCF}' .. '\u{0DD4}' | '\u{0DD6}' | '\u{0DD8}' .. '\u{0DDF}' | '\u{0DE6}' .. '\u{0DEF}' | '\u{0DF2}' .. '\u{0DF3}' - | '\u{0E01}' .. '\u{0E3A}' - | '\u{0E40}' .. '\u{0E4E}' + | '\u{0E31}' + | '\u{0E33}' .. '\u{0E3A}' + | '\u{0E47}' .. '\u{0E4E}' | '\u{0E50}' .. '\u{0E59}' - | '\u{0E81}' .. '\u{0E82}' - | '\u{0E84}' - | '\u{0E86}' .. '\u{0E8A}' - | '\u{0E8C}' .. '\u{0EA3}' - | '\u{0EA5}' - | '\u{0EA7}' .. '\u{0EBD}' - | '\u{0EC0}' .. '\u{0EC4}' - | '\u{0EC6}' + | '\u{0EB1}' + | '\u{0EB3}' .. '\u{0EBC}' | '\u{0EC8}' .. '\u{0ECE}' | '\u{0ED0}' .. '\u{0ED9}' - | '\u{0EDC}' .. '\u{0EDF}' - | '\u{0F00}' | '\u{0F18}' .. '\u{0F19}' | '\u{0F20}' .. 
'\u{0F29}' | '\u{0F35}' | '\u{0F37}' | '\u{0F39}' - | '\u{0F3E}' .. '\u{0F47}' - | '\u{0F49}' .. '\u{0F6C}' + | '\u{0F3E}' .. '\u{0F3F}' | '\u{0F71}' .. '\u{0F84}' - | '\u{0F86}' .. '\u{0F97}' + | '\u{0F86}' .. '\u{0F87}' + | '\u{0F8D}' .. '\u{0F97}' | '\u{0F99}' .. '\u{0FBC}' | '\u{0FC6}' - | '\u{1000}' .. '\u{1049}' - | '\u{1050}' .. '\u{109D}' - | '\u{10A0}' .. '\u{10C5}' - | '\u{10C7}' - | '\u{10CD}' - | '\u{10D0}' .. '\u{10FA}' - | '\u{10FC}' .. '\u{1248}' - | '\u{124A}' .. '\u{124D}' - | '\u{1250}' .. '\u{1256}' - | '\u{1258}' - | '\u{125A}' .. '\u{125D}' - | '\u{1260}' .. '\u{1288}' - | '\u{128A}' .. '\u{128D}' - | '\u{1290}' .. '\u{12B0}' - | '\u{12B2}' .. '\u{12B5}' - | '\u{12B8}' .. '\u{12BE}' - | '\u{12C0}' - | '\u{12C2}' .. '\u{12C5}' - | '\u{12C8}' .. '\u{12D6}' - | '\u{12D8}' .. '\u{1310}' - | '\u{1312}' .. '\u{1315}' - | '\u{1318}' .. '\u{135A}' + | '\u{102B}' .. '\u{103E}' + | '\u{1040}' .. '\u{1049}' + | '\u{1056}' .. '\u{1059}' + | '\u{105E}' .. '\u{1060}' + | '\u{1062}' .. '\u{1064}' + | '\u{1067}' .. '\u{106D}' + | '\u{1071}' .. '\u{1074}' + | '\u{1082}' .. '\u{108D}' + | '\u{108F}' .. '\u{109D}' | '\u{135D}' .. '\u{135F}' | '\u{1369}' .. '\u{1371}' - | '\u{1380}' .. '\u{138F}' - | '\u{13A0}' .. '\u{13F5}' - | '\u{13F8}' .. '\u{13FD}' - | '\u{1401}' .. '\u{166C}' - | '\u{166F}' .. '\u{167F}' - | '\u{1681}' .. '\u{169A}' - | '\u{16A0}' .. '\u{16EA}' - | '\u{16EE}' .. '\u{16F8}' - | '\u{1700}' .. '\u{1715}' - | '\u{171F}' .. '\u{1734}' - | '\u{1740}' .. '\u{1753}' - | '\u{1760}' .. '\u{176C}' - | '\u{176E}' .. '\u{1770}' + | '\u{1712}' .. '\u{1715}' + | '\u{1732}' .. '\u{1734}' + | '\u{1752}' .. '\u{1753}' | '\u{1772}' .. '\u{1773}' - | '\u{1780}' .. '\u{17D3}' - | '\u{17D7}' - | '\u{17DC}' .. '\u{17DD}' + | '\u{17B4}' .. '\u{17D3}' + | '\u{17DD}' | '\u{17E0}' .. '\u{17E9}' | '\u{180B}' .. '\u{180D}' | '\u{180F}' .. '\u{1819}' - | '\u{1820}' .. '\u{1878}' - | '\u{1880}' .. '\u{18AA}' - | '\u{18B0}' .. '\u{18F5}' - | '\u{1900}' .. 
'\u{191E}' + | '\u{18A9}' | '\u{1920}' .. '\u{192B}' | '\u{1930}' .. '\u{193B}' - | '\u{1946}' .. '\u{196D}' - | '\u{1970}' .. '\u{1974}' - | '\u{1980}' .. '\u{19AB}' - | '\u{19B0}' .. '\u{19C9}' + | '\u{1946}' .. '\u{194F}' | '\u{19D0}' .. '\u{19DA}' - | '\u{1A00}' .. '\u{1A1B}' - | '\u{1A20}' .. '\u{1A5E}' + | '\u{1A17}' .. '\u{1A1B}' + | '\u{1A55}' .. '\u{1A5E}' | '\u{1A60}' .. '\u{1A7C}' | '\u{1A7F}' .. '\u{1A89}' | '\u{1A90}' .. '\u{1A99}' - | '\u{1AA7}' | '\u{1AB0}' .. '\u{1ABD}' | '\u{1ABF}' .. '\u{1ACE}' - | '\u{1B00}' .. '\u{1B4C}' + | '\u{1B00}' .. '\u{1B04}' + | '\u{1B34}' .. '\u{1B44}' | '\u{1B50}' .. '\u{1B59}' | '\u{1B6B}' .. '\u{1B73}' - | '\u{1B80}' .. '\u{1BF3}' - | '\u{1C00}' .. '\u{1C37}' + | '\u{1B80}' .. '\u{1B82}' + | '\u{1BA1}' .. '\u{1BAD}' + | '\u{1BB0}' .. '\u{1BB9}' + | '\u{1BE6}' .. '\u{1BF3}' + | '\u{1C24}' .. '\u{1C37}' | '\u{1C40}' .. '\u{1C49}' - | '\u{1C4D}' .. '\u{1C7D}' - | '\u{1C80}' .. '\u{1C8A}' - | '\u{1C90}' .. '\u{1CBA}' - | '\u{1CBD}' .. '\u{1CBF}' + | '\u{1C50}' .. '\u{1C59}' | '\u{1CD0}' .. '\u{1CD2}' - | '\u{1CD4}' .. '\u{1CFA}' - | '\u{1D00}' .. '\u{1F15}' - | '\u{1F18}' .. '\u{1F1D}' - | '\u{1F20}' .. '\u{1F45}' - | '\u{1F48}' .. '\u{1F4D}' - | '\u{1F50}' .. '\u{1F57}' - | '\u{1F59}' - | '\u{1F5B}' - | '\u{1F5D}' - | '\u{1F5F}' .. '\u{1F7D}' - | '\u{1F80}' .. '\u{1FB4}' - | '\u{1FB6}' .. '\u{1FBC}' - | '\u{1FBE}' - | '\u{1FC2}' .. '\u{1FC4}' - | '\u{1FC6}' .. '\u{1FCC}' - | '\u{1FD0}' .. '\u{1FD3}' - | '\u{1FD6}' .. '\u{1FDB}' - | '\u{1FE0}' .. '\u{1FEC}' - | '\u{1FF2}' .. '\u{1FF4}' - | '\u{1FF6}' .. '\u{1FFC}' + | '\u{1CD4}' .. '\u{1CE8}' + | '\u{1CED}' + | '\u{1CF4}' + | '\u{1CF7}' .. '\u{1CF9}' + | '\u{1DC0}' .. '\u{1DFF}' | '\u{200C}' .. '\u{200D}' | '\u{203F}' .. '\u{2040}' | '\u{2054}' - | '\u{2071}' - | '\u{207F}' - | '\u{2090}' .. '\u{209C}' | '\u{20D0}' .. '\u{20DC}' | '\u{20E1}' | '\u{20E5}' .. '\u{20F0}' - | '\u{2102}' - | '\u{2107}' - | '\u{210A}' .. '\u{2113}' - | '\u{2115}' - | '\u{2118}' .. 
'\u{211D}' - | '\u{2124}' - | '\u{2126}' - | '\u{2128}' - | '\u{212A}' .. '\u{2139}' - | '\u{213C}' .. '\u{213F}' - | '\u{2145}' .. '\u{2149}' - | '\u{214E}' - | '\u{2160}' .. '\u{2188}' - | '\u{2C00}' .. '\u{2CE4}' - | '\u{2CEB}' .. '\u{2CF3}' - | '\u{2D00}' .. '\u{2D25}' - | '\u{2D27}' - | '\u{2D2D}' - | '\u{2D30}' .. '\u{2D67}' - | '\u{2D6F}' - | '\u{2D7F}' .. '\u{2D96}' - | '\u{2DA0}' .. '\u{2DA6}' - | '\u{2DA8}' .. '\u{2DAE}' - | '\u{2DB0}' .. '\u{2DB6}' - | '\u{2DB8}' .. '\u{2DBE}' - | '\u{2DC0}' .. '\u{2DC6}' - | '\u{2DC8}' .. '\u{2DCE}' - | '\u{2DD0}' .. '\u{2DD6}' - | '\u{2DD8}' .. '\u{2DDE}' + | '\u{2CEF}' .. '\u{2CF1}' + | '\u{2D7F}' | '\u{2DE0}' .. '\u{2DFF}' - | '\u{3005}' .. '\u{3007}' - | '\u{3021}' .. '\u{302F}' - | '\u{3031}' .. '\u{3035}' - | '\u{3038}' .. '\u{303C}' - | '\u{3041}' .. '\u{3096}' + | '\u{302A}' .. '\u{302F}' | '\u{3099}' .. '\u{309A}' - | '\u{309D}' .. '\u{309F}' - | '\u{30A1}' .. '\u{30FF}' - | '\u{3105}' .. '\u{312F}' - | '\u{3131}' .. '\u{318E}' - | '\u{31A0}' .. '\u{31BF}' - | '\u{31F0}' .. '\u{31FF}' - | '\u{3400}' .. '\u{4DBF}' - | '\u{4E00}' .. '\u{A48C}' - | '\u{A4D0}' .. '\u{A4FD}' - | '\u{A500}' .. '\u{A60C}' - | '\u{A610}' .. '\u{A62B}' - | '\u{A640}' .. '\u{A66F}' + | '\u{30FB}' + | '\u{A620}' .. '\u{A629}' + | '\u{A66F}' | '\u{A674}' .. '\u{A67D}' - | '\u{A67F}' .. '\u{A6F1}' - | '\u{A717}' .. '\u{A71F}' - | '\u{A722}' .. '\u{A788}' - | '\u{A78B}' .. '\u{A7CD}' - | '\u{A7D0}' .. '\u{A7D1}' - | '\u{A7D3}' - | '\u{A7D5}' .. '\u{A7DC}' - | '\u{A7F2}' .. '\u{A827}' + | '\u{A69E}' .. '\u{A69F}' + | '\u{A6F0}' .. '\u{A6F1}' + | '\u{A802}' + | '\u{A806}' + | '\u{A80B}' + | '\u{A823}' .. '\u{A827}' | '\u{A82C}' - | '\u{A840}' .. '\u{A873}' - | '\u{A880}' .. '\u{A8C5}' + | '\u{A880}' .. '\u{A881}' + | '\u{A8B4}' .. '\u{A8C5}' | '\u{A8D0}' .. '\u{A8D9}' - | '\u{A8E0}' .. '\u{A8F7}' - | '\u{A8FB}' - | '\u{A8FD}' .. '\u{A92D}' - | '\u{A930}' .. '\u{A953}' - | '\u{A960}' .. '\u{A97C}' - | '\u{A980}' .. '\u{A9C0}' - | '\u{A9CF}' .. 
'\u{A9D9}' - | '\u{A9E0}' .. '\u{A9FE}' - | '\u{AA00}' .. '\u{AA36}' - | '\u{AA40}' .. '\u{AA4D}' + | '\u{A8E0}' .. '\u{A8F1}' + | '\u{A8FF}' .. '\u{A909}' + | '\u{A926}' .. '\u{A92D}' + | '\u{A947}' .. '\u{A953}' + | '\u{A980}' .. '\u{A983}' + | '\u{A9B3}' .. '\u{A9C0}' + | '\u{A9D0}' .. '\u{A9D9}' + | '\u{A9E5}' + | '\u{A9F0}' .. '\u{A9F9}' + | '\u{AA29}' .. '\u{AA36}' + | '\u{AA43}' + | '\u{AA4C}' .. '\u{AA4D}' | '\u{AA50}' .. '\u{AA59}' - | '\u{AA60}' .. '\u{AA76}' - | '\u{AA7A}' .. '\u{AAC2}' - | '\u{AADB}' .. '\u{AADD}' - | '\u{AAE0}' .. '\u{AAEF}' - | '\u{AAF2}' .. '\u{AAF6}' - | '\u{AB01}' .. '\u{AB06}' - | '\u{AB09}' .. '\u{AB0E}' - | '\u{AB11}' .. '\u{AB16}' - | '\u{AB20}' .. '\u{AB26}' - | '\u{AB28}' .. '\u{AB2E}' - | '\u{AB30}' .. '\u{AB5A}' - | '\u{AB5C}' .. '\u{AB69}' - | '\u{AB70}' .. '\u{ABEA}' + | '\u{AA7B}' .. '\u{AA7D}' + | '\u{AAB0}' + | '\u{AAB2}' .. '\u{AAB4}' + | '\u{AAB7}' .. '\u{AAB8}' + | '\u{AABE}' .. '\u{AABF}' + | '\u{AAC1}' + | '\u{AAEB}' .. '\u{AAEF}' + | '\u{AAF5}' .. '\u{AAF6}' + | '\u{ABE3}' .. '\u{ABEA}' | '\u{ABEC}' .. '\u{ABED}' | '\u{ABF0}' .. '\u{ABF9}' - | '\u{AC00}' .. '\u{D7A3}' - | '\u{D7B0}' .. '\u{D7C6}' - | '\u{D7CB}' .. '\u{D7FB}' - | '\u{F900}' .. '\u{FA6D}' - | '\u{FA70}' .. '\u{FAD9}' - | '\u{FB00}' .. '\u{FB06}' - | '\u{FB13}' .. '\u{FB17}' - | '\u{FB1D}' .. '\u{FB28}' - | '\u{FB2A}' .. '\u{FB36}' - | '\u{FB38}' .. '\u{FB3C}' - | '\u{FB3E}' - | '\u{FB40}' .. '\u{FB41}' - | '\u{FB43}' .. '\u{FB44}' - | '\u{FB46}' .. '\u{FBB1}' - | '\u{FBD3}' .. '\u{FC5D}' - | '\u{FC64}' .. '\u{FD3D}' - | '\u{FD50}' .. '\u{FD8F}' - | '\u{FD92}' .. '\u{FDC7}' - | '\u{FDF0}' .. '\u{FDF9}' + | '\u{FB1E}' | '\u{FE00}' .. '\u{FE0F}' | '\u{FE20}' .. '\u{FE2F}' | '\u{FE33}' .. '\u{FE34}' | '\u{FE4D}' .. '\u{FE4F}' - | '\u{FE71}' - | '\u{FE73}' - | '\u{FE77}' - | '\u{FE79}' - | '\u{FE7B}' - | '\u{FE7D}' - | '\u{FE7F}' .. '\u{FEFC}' | '\u{FF10}' .. '\u{FF19}' - | '\u{FF21}' .. '\u{FF3A}' | '\u{FF3F}' - | '\u{FF41}' .. 
'\u{FF5A}' - | '\u{FF65}' .. '\u{FFBE}' - | '\u{FFC2}' .. '\u{FFC7}' - | '\u{FFCA}' .. '\u{FFCF}' - | '\u{FFD2}' .. '\u{FFD7}' - | '\u{FFDA}' .. '\u{FFDC}' - | '\u{10000}' .. '\u{1000B}' - | '\u{1000D}' .. '\u{10026}' - | '\u{10028}' .. '\u{1003A}' - | '\u{1003C}' .. '\u{1003D}' - | '\u{1003F}' .. '\u{1004D}' - | '\u{10050}' .. '\u{1005D}' - | '\u{10080}' .. '\u{100FA}' - | '\u{10140}' .. '\u{10174}' + | '\u{FF65}' + | '\u{FF9E}' .. '\u{FF9F}' | '\u{101FD}' - | '\u{10280}' .. '\u{1029C}' - | '\u{102A0}' .. '\u{102D0}' | '\u{102E0}' - | '\u{10300}' .. '\u{1031F}' - | '\u{1032D}' .. '\u{1034A}' - | '\u{10350}' .. '\u{1037A}' - | '\u{10380}' .. '\u{1039D}' - | '\u{103A0}' .. '\u{103C3}' - | '\u{103C8}' .. '\u{103CF}' - | '\u{103D1}' .. '\u{103D5}' - | '\u{10400}' .. '\u{1049D}' + | '\u{10376}' .. '\u{1037A}' | '\u{104A0}' .. '\u{104A9}' - | '\u{104B0}' .. '\u{104D3}' - | '\u{104D8}' .. '\u{104FB}' - | '\u{10500}' .. '\u{10527}' - | '\u{10530}' .. '\u{10563}' - | '\u{10570}' .. '\u{1057A}' - | '\u{1057C}' .. '\u{1058A}' - | '\u{1058C}' .. '\u{10592}' - | '\u{10594}' .. '\u{10595}' - | '\u{10597}' .. '\u{105A1}' - | '\u{105A3}' .. '\u{105B1}' - | '\u{105B3}' .. '\u{105B9}' - | '\u{105BB}' .. '\u{105BC}' - | '\u{105C0}' .. '\u{105F3}' - | '\u{10600}' .. '\u{10736}' - | '\u{10740}' .. '\u{10755}' - | '\u{10760}' .. '\u{10767}' - | '\u{10780}' .. '\u{10785}' - | '\u{10787}' .. '\u{107B0}' - | '\u{107B2}' .. '\u{107BA}' - | '\u{10800}' .. '\u{10805}' - | '\u{10808}' - | '\u{1080A}' .. '\u{10835}' - | '\u{10837}' .. '\u{10838}' - | '\u{1083C}' - | '\u{1083F}' .. '\u{10855}' - | '\u{10860}' .. '\u{10876}' - | '\u{10880}' .. '\u{1089E}' - | '\u{108E0}' .. '\u{108F2}' - | '\u{108F4}' .. '\u{108F5}' - | '\u{10900}' .. '\u{10915}' - | '\u{10920}' .. '\u{10939}' - | '\u{10980}' .. '\u{109B7}' - | '\u{109BE}' .. '\u{109BF}' - | '\u{10A00}' .. '\u{10A03}' + | '\u{10A01}' .. '\u{10A03}' | '\u{10A05}' .. '\u{10A06}' - | '\u{10A0C}' .. '\u{10A13}' - | '\u{10A15}' .. 
'\u{10A17}' - | '\u{10A19}' .. '\u{10A35}' + | '\u{10A0C}' .. '\u{10A0F}' | '\u{10A38}' .. '\u{10A3A}' | '\u{10A3F}' - | '\u{10A60}' .. '\u{10A7C}' - | '\u{10A80}' .. '\u{10A9C}' - | '\u{10AC0}' .. '\u{10AC7}' - | '\u{10AC9}' .. '\u{10AE6}' - | '\u{10B00}' .. '\u{10B35}' - | '\u{10B40}' .. '\u{10B55}' - | '\u{10B60}' .. '\u{10B72}' - | '\u{10B80}' .. '\u{10B91}' - | '\u{10C00}' .. '\u{10C48}' - | '\u{10C80}' .. '\u{10CB2}' - | '\u{10CC0}' .. '\u{10CF2}' - | '\u{10D00}' .. '\u{10D27}' + | '\u{10AE5}' .. '\u{10AE6}' + | '\u{10D24}' .. '\u{10D27}' | '\u{10D30}' .. '\u{10D39}' - | '\u{10D40}' .. '\u{10D65}' + | '\u{10D40}' .. '\u{10D49}' | '\u{10D69}' .. '\u{10D6D}' - | '\u{10D6F}' .. '\u{10D85}' - | '\u{10E80}' .. '\u{10EA9}' | '\u{10EAB}' .. '\u{10EAC}' - | '\u{10EB0}' .. '\u{10EB1}' - | '\u{10EC2}' .. '\u{10EC4}' - | '\u{10EFC}' .. '\u{10F1C}' - | '\u{10F27}' - | '\u{10F30}' .. '\u{10F50}' - | '\u{10F70}' .. '\u{10F85}' - | '\u{10FB0}' .. '\u{10FC4}' - | '\u{10FE0}' .. '\u{10FF6}' - | '\u{11000}' .. '\u{11046}' - | '\u{11066}' .. '\u{11075}' - | '\u{1107F}' .. '\u{110BA}' + | '\u{10EFC}' .. '\u{10EFF}' + | '\u{10F46}' .. '\u{10F50}' + | '\u{10F82}' .. '\u{10F85}' + | '\u{11000}' .. '\u{11002}' + | '\u{11038}' .. '\u{11046}' + | '\u{11066}' .. '\u{11070}' + | '\u{11073}' .. '\u{11074}' + | '\u{1107F}' .. '\u{11082}' + | '\u{110B0}' .. '\u{110BA}' | '\u{110C2}' - | '\u{110D0}' .. '\u{110E8}' | '\u{110F0}' .. '\u{110F9}' - | '\u{11100}' .. '\u{11134}' + | '\u{11100}' .. '\u{11102}' + | '\u{11127}' .. '\u{11134}' | '\u{11136}' .. '\u{1113F}' - | '\u{11144}' .. '\u{11147}' - | '\u{11150}' .. '\u{11173}' - | '\u{11176}' - | '\u{11180}' .. '\u{111C4}' + | '\u{11145}' .. '\u{11146}' + | '\u{11173}' + | '\u{11180}' .. '\u{11182}' + | '\u{111B3}' .. '\u{111C0}' | '\u{111C9}' .. '\u{111CC}' - | '\u{111CE}' .. '\u{111DA}' - | '\u{111DC}' - | '\u{11200}' .. '\u{11211}' - | '\u{11213}' .. '\u{11237}' - | '\u{1123E}' .. '\u{11241}' - | '\u{11280}' .. 
'\u{11286}' - | '\u{11288}' - | '\u{1128A}' .. '\u{1128D}' - | '\u{1128F}' .. '\u{1129D}' - | '\u{1129F}' .. '\u{112A8}' - | '\u{112B0}' .. '\u{112EA}' + | '\u{111CE}' .. '\u{111D9}' + | '\u{1122C}' .. '\u{11237}' + | '\u{1123E}' + | '\u{11241}' + | '\u{112DF}' .. '\u{112EA}' | '\u{112F0}' .. '\u{112F9}' | '\u{11300}' .. '\u{11303}' - | '\u{11305}' .. '\u{1130C}' - | '\u{1130F}' .. '\u{11310}' - | '\u{11313}' .. '\u{11328}' - | '\u{1132A}' .. '\u{11330}' - | '\u{11332}' .. '\u{11333}' - | '\u{11335}' .. '\u{11339}' - | '\u{1133B}' .. '\u{11344}' + | '\u{1133B}' .. '\u{1133C}' + | '\u{1133E}' .. '\u{11344}' | '\u{11347}' .. '\u{11348}' | '\u{1134B}' .. '\u{1134D}' - | '\u{11350}' | '\u{11357}' - | '\u{1135D}' .. '\u{11363}' + | '\u{11362}' .. '\u{11363}' | '\u{11366}' .. '\u{1136C}' | '\u{11370}' .. '\u{11374}' - | '\u{11380}' .. '\u{11389}' - | '\u{1138B}' - | '\u{1138E}' - | '\u{11390}' .. '\u{113B5}' - | '\u{113B7}' .. '\u{113C0}' + | '\u{113B8}' .. '\u{113C0}' | '\u{113C2}' | '\u{113C5}' | '\u{113C7}' .. '\u{113CA}' - | '\u{113CC}' .. '\u{113D3}' + | '\u{113CC}' .. '\u{113D0}' + | '\u{113D2}' | '\u{113E1}' .. '\u{113E2}' - | '\u{11400}' .. '\u{1144A}' + | '\u{11435}' .. '\u{11446}' | '\u{11450}' .. '\u{11459}' - | '\u{1145E}' .. '\u{11461}' - | '\u{11480}' .. '\u{114C5}' - | '\u{114C7}' + | '\u{1145E}' + | '\u{114B0}' .. '\u{114C3}' | '\u{114D0}' .. '\u{114D9}' - | '\u{11580}' .. '\u{115B5}' + | '\u{115AF}' .. '\u{115B5}' | '\u{115B8}' .. '\u{115C0}' - | '\u{115D8}' .. '\u{115DD}' - | '\u{11600}' .. '\u{11640}' - | '\u{11644}' + | '\u{115DC}' .. '\u{115DD}' + | '\u{11630}' .. '\u{11640}' | '\u{11650}' .. '\u{11659}' - | '\u{11680}' .. '\u{116B8}' + | '\u{116AB}' .. '\u{116B7}' | '\u{116C0}' .. '\u{116C9}' | '\u{116D0}' .. '\u{116E3}' - | '\u{11700}' .. '\u{1171A}' | '\u{1171D}' .. '\u{1172B}' | '\u{11730}' .. '\u{11739}' - | '\u{11740}' .. '\u{11746}' - | '\u{11800}' .. '\u{1183A}' - | '\u{118A0}' .. '\u{118E9}' - | '\u{118FF}' .. 
'\u{11906}' - | '\u{11909}' - | '\u{1190C}' .. '\u{11913}' - | '\u{11915}' .. '\u{11916}' - | '\u{11918}' .. '\u{11935}' + | '\u{1182C}' .. '\u{1183A}' + | '\u{118E0}' .. '\u{118E9}' + | '\u{11930}' .. '\u{11935}' | '\u{11937}' .. '\u{11938}' - | '\u{1193B}' .. '\u{11943}' + | '\u{1193B}' .. '\u{1193E}' + | '\u{11940}' + | '\u{11942}' .. '\u{11943}' | '\u{11950}' .. '\u{11959}' - | '\u{119A0}' .. '\u{119A7}' - | '\u{119AA}' .. '\u{119D7}' - | '\u{119DA}' .. '\u{119E1}' - | '\u{119E3}' .. '\u{119E4}' - | '\u{11A00}' .. '\u{11A3E}' + | '\u{119D1}' .. '\u{119D7}' + | '\u{119DA}' .. '\u{119E0}' + | '\u{119E4}' + | '\u{11A01}' .. '\u{11A0A}' + | '\u{11A33}' .. '\u{11A39}' + | '\u{11A3B}' .. '\u{11A3E}' | '\u{11A47}' - | '\u{11A50}' .. '\u{11A99}' - | '\u{11A9D}' - | '\u{11AB0}' .. '\u{11AF8}' - | '\u{11BC0}' .. '\u{11BE0}' + | '\u{11A51}' .. '\u{11A5B}' + | '\u{11A8A}' .. '\u{11A99}' | '\u{11BF0}' .. '\u{11BF9}' - | '\u{11C00}' .. '\u{11C08}' - | '\u{11C0A}' .. '\u{11C36}' - | '\u{11C38}' .. '\u{11C40}' + | '\u{11C2F}' .. '\u{11C36}' + | '\u{11C38}' .. '\u{11C3F}' | '\u{11C50}' .. '\u{11C59}' - | '\u{11C72}' .. '\u{11C8F}' | '\u{11C92}' .. '\u{11CA7}' | '\u{11CA9}' .. '\u{11CB6}' - | '\u{11D00}' .. '\u{11D06}' - | '\u{11D08}' .. '\u{11D09}' - | '\u{11D0B}' .. '\u{11D36}' + | '\u{11D31}' .. '\u{11D36}' | '\u{11D3A}' | '\u{11D3C}' .. '\u{11D3D}' - | '\u{11D3F}' .. '\u{11D47}' + | '\u{11D3F}' .. '\u{11D45}' + | '\u{11D47}' | '\u{11D50}' .. '\u{11D59}' - | '\u{11D60}' .. '\u{11D65}' - | '\u{11D67}' .. '\u{11D68}' - | '\u{11D6A}' .. '\u{11D8E}' + | '\u{11D8A}' .. '\u{11D8E}' | '\u{11D90}' .. '\u{11D91}' - | '\u{11D93}' .. '\u{11D98}' + | '\u{11D93}' .. '\u{11D97}' | '\u{11DA0}' .. '\u{11DA9}' - | '\u{11EE0}' .. '\u{11EF6}' - | '\u{11F00}' .. '\u{11F10}' - | '\u{11F12}' .. '\u{11F3A}' + | '\u{11EF3}' .. '\u{11EF6}' + | '\u{11F00}' .. '\u{11F01}' + | '\u{11F03}' + | '\u{11F34}' .. '\u{11F3A}' | '\u{11F3E}' .. '\u{11F42}' | '\u{11F50}' .. 
'\u{11F5A}' - | '\u{11FB0}' - | '\u{12000}' .. '\u{12399}' - | '\u{12400}' .. '\u{1246E}' - | '\u{12480}' .. '\u{12543}' - | '\u{12F90}' .. '\u{12FF0}' - | '\u{13000}' .. '\u{1342F}' - | '\u{13440}' .. '\u{13455}' - | '\u{13460}' .. '\u{143FA}' - | '\u{14400}' .. '\u{14646}' - | '\u{16100}' .. '\u{16139}' - | '\u{16800}' .. '\u{16A38}' - | '\u{16A40}' .. '\u{16A5E}' + | '\u{13440}' + | '\u{13447}' .. '\u{13455}' + | '\u{1611E}' .. '\u{16139}' | '\u{16A60}' .. '\u{16A69}' - | '\u{16A70}' .. '\u{16ABE}' | '\u{16AC0}' .. '\u{16AC9}' - | '\u{16AD0}' .. '\u{16AED}' | '\u{16AF0}' .. '\u{16AF4}' - | '\u{16B00}' .. '\u{16B36}' - | '\u{16B40}' .. '\u{16B43}' + | '\u{16B30}' .. '\u{16B36}' | '\u{16B50}' .. '\u{16B59}' - | '\u{16B63}' .. '\u{16B77}' - | '\u{16B7D}' .. '\u{16B8F}' - | '\u{16D40}' .. '\u{16D6C}' | '\u{16D70}' .. '\u{16D79}' - | '\u{16E40}' .. '\u{16E7F}' - | '\u{16F00}' .. '\u{16F4A}' - | '\u{16F4F}' .. '\u{16F87}' - | '\u{16F8F}' .. '\u{16F9F}' - | '\u{16FE0}' .. '\u{16FE1}' - | '\u{16FE3}' .. '\u{16FE4}' + | '\u{16F4F}' + | '\u{16F51}' .. '\u{16F87}' + | '\u{16F8F}' .. '\u{16F92}' + | '\u{16FE4}' | '\u{16FF0}' .. '\u{16FF1}' - | '\u{17000}' .. '\u{187F7}' - | '\u{18800}' .. '\u{18CD5}' - | '\u{18CFF}' .. '\u{18D08}' - | '\u{1AFF0}' .. '\u{1AFF3}' - | '\u{1AFF5}' .. '\u{1AFFB}' - | '\u{1AFFD}' .. '\u{1AFFE}' - | '\u{1B000}' .. '\u{1B122}' - | '\u{1B132}' - | '\u{1B150}' .. '\u{1B152}' - | '\u{1B155}' - | '\u{1B164}' .. '\u{1B167}' - | '\u{1B170}' .. '\u{1B2FB}' - | '\u{1BC00}' .. '\u{1BC6A}' - | '\u{1BC70}' .. '\u{1BC7C}' - | '\u{1BC80}' .. '\u{1BC88}' - | '\u{1BC90}' .. '\u{1BC99}' | '\u{1BC9D}' .. '\u{1BC9E}' | '\u{1CCF0}' .. '\u{1CCF9}' | '\u{1CF00}' .. '\u{1CF2D}' @@ -1271,36 +945,6 @@ fragment ID_CONTINUE // for Python 3.14.2 | '\u{1D185}' .. '\u{1D18B}' | '\u{1D1AA}' .. '\u{1D1AD}' | '\u{1D242}' .. '\u{1D244}' - | '\u{1D400}' .. '\u{1D454}' - | '\u{1D456}' .. '\u{1D49C}' - | '\u{1D49E}' .. '\u{1D49F}' - | '\u{1D4A2}' - | '\u{1D4A5}' .. 
'\u{1D4A6}' - | '\u{1D4A9}' .. '\u{1D4AC}' - | '\u{1D4AE}' .. '\u{1D4B9}' - | '\u{1D4BB}' - | '\u{1D4BD}' .. '\u{1D4C3}' - | '\u{1D4C5}' .. '\u{1D505}' - | '\u{1D507}' .. '\u{1D50A}' - | '\u{1D50D}' .. '\u{1D514}' - | '\u{1D516}' .. '\u{1D51C}' - | '\u{1D51E}' .. '\u{1D539}' - | '\u{1D53B}' .. '\u{1D53E}' - | '\u{1D540}' .. '\u{1D544}' - | '\u{1D546}' - | '\u{1D54A}' .. '\u{1D550}' - | '\u{1D552}' .. '\u{1D6A5}' - | '\u{1D6A8}' .. '\u{1D6C0}' - | '\u{1D6C2}' .. '\u{1D6DA}' - | '\u{1D6DC}' .. '\u{1D6FA}' - | '\u{1D6FC}' .. '\u{1D714}' - | '\u{1D716}' .. '\u{1D734}' - | '\u{1D736}' .. '\u{1D74E}' - | '\u{1D750}' .. '\u{1D76E}' - | '\u{1D770}' .. '\u{1D788}' - | '\u{1D78A}' .. '\u{1D7A8}' - | '\u{1D7AA}' .. '\u{1D7C2}' - | '\u{1D7C4}' .. '\u{1D7CB}' | '\u{1D7CE}' .. '\u{1D7FF}' | '\u{1DA00}' .. '\u{1DA36}' | '\u{1DA3B}' .. '\u{1DA6C}' @@ -1308,74 +952,23 @@ fragment ID_CONTINUE // for Python 3.14.2 | '\u{1DA84}' | '\u{1DA9B}' .. '\u{1DA9F}' | '\u{1DAA1}' .. '\u{1DAAF}' - | '\u{1DF00}' .. '\u{1DF1E}' - | '\u{1DF25}' .. '\u{1DF2A}' | '\u{1E000}' .. '\u{1E006}' | '\u{1E008}' .. '\u{1E018}' | '\u{1E01B}' .. '\u{1E021}' | '\u{1E023}' .. '\u{1E024}' | '\u{1E026}' .. '\u{1E02A}' - | '\u{1E030}' .. '\u{1E06D}' | '\u{1E08F}' - | '\u{1E100}' .. '\u{1E12C}' - | '\u{1E130}' .. '\u{1E13D}' + | '\u{1E130}' .. '\u{1E136}' | '\u{1E140}' .. '\u{1E149}' - | '\u{1E14E}' - | '\u{1E290}' .. '\u{1E2AE}' - | '\u{1E2C0}' .. '\u{1E2F9}' - | '\u{1E4D0}' .. '\u{1E4F9}' - | '\u{1E5D0}' .. '\u{1E5FA}' - | '\u{1E7E0}' .. '\u{1E7E6}' - | '\u{1E7E8}' .. '\u{1E7EB}' - | '\u{1E7ED}' .. '\u{1E7EE}' - | '\u{1E7F0}' .. '\u{1E7FE}' - | '\u{1E800}' .. '\u{1E8C4}' + | '\u{1E2AE}' + | '\u{1E2EC}' .. '\u{1E2F9}' + | '\u{1E4EC}' .. '\u{1E4F9}' + | '\u{1E5EE}' .. '\u{1E5EF}' + | '\u{1E5F1}' .. '\u{1E5FA}' | '\u{1E8D0}' .. '\u{1E8D6}' - | '\u{1E900}' .. '\u{1E94B}' + | '\u{1E944}' .. '\u{1E94A}' | '\u{1E950}' .. '\u{1E959}' - | '\u{1EE00}' .. '\u{1EE03}' - | '\u{1EE05}' .. '\u{1EE1F}' - | '\u{1EE21}' .. 
'\u{1EE22}' - | '\u{1EE24}' - | '\u{1EE27}' - | '\u{1EE29}' .. '\u{1EE32}' - | '\u{1EE34}' .. '\u{1EE37}' - | '\u{1EE39}' - | '\u{1EE3B}' - | '\u{1EE42}' - | '\u{1EE47}' - | '\u{1EE49}' - | '\u{1EE4B}' - | '\u{1EE4D}' .. '\u{1EE4F}' - | '\u{1EE51}' .. '\u{1EE52}' - | '\u{1EE54}' - | '\u{1EE57}' - | '\u{1EE59}' - | '\u{1EE5B}' - | '\u{1EE5D}' - | '\u{1EE5F}' - | '\u{1EE61}' .. '\u{1EE62}' - | '\u{1EE64}' - | '\u{1EE67}' .. '\u{1EE6A}' - | '\u{1EE6C}' .. '\u{1EE72}' - | '\u{1EE74}' .. '\u{1EE77}' - | '\u{1EE79}' .. '\u{1EE7C}' - | '\u{1EE7E}' - | '\u{1EE80}' .. '\u{1EE89}' - | '\u{1EE8B}' .. '\u{1EE9B}' - | '\u{1EEA1}' .. '\u{1EEA3}' - | '\u{1EEA5}' .. '\u{1EEA9}' - | '\u{1EEAB}' .. '\u{1EEBB}' | '\u{1FBF0}' .. '\u{1FBF9}' - | '\u{20000}' .. '\u{2A6DF}' - | '\u{2A700}' .. '\u{2B739}' - | '\u{2B740}' .. '\u{2B81D}' - | '\u{2B820}' .. '\u{2CEA1}' - | '\u{2CEB0}' .. '\u{2EBE0}' - | '\u{2EBF0}' .. '\u{2EE5D}' - | '\u{2F800}' .. '\u{2FA1D}' - | '\u{30000}' .. '\u{3134A}' - | '\u{31350}' .. '\u{323AF}' | '\u{E0100}' .. '\u{E01EF}' ;