diff --git a/python/pom.xml b/python/pom.xml
index c99982bdfa..d0deb48fa2 100644
--- a/python/pom.xml
+++ b/python/pom.xml
@@ -16,6 +16,6 @@
python2
python3
python2_7_18
- python3_13
+ python3_14
diff --git a/python/python3_13/CSharp/PythonLexerBase.cs b/python/python3_13/CSharp/PythonLexerBase.cs
deleted file mode 100644
index bbaf1bc678..0000000000
--- a/python/python3_13/CSharp/PythonLexerBase.cs
+++ /dev/null
@@ -1,802 +0,0 @@
-/*
-The MIT License (MIT)
-Copyright (c) 2021 Robert Einhorn
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
- */
-
-/*
- * Project : Python Indent/Dedent handler for ANTLR4 grammars
- *
- * Developed by : Robert Einhorn
- */
-
-#nullable enable
-using Antlr4.Runtime;
-using System;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.IO;
-using System.Collections.Generic;
-
-[assembly: CLSCompliant(true)]
-
-public abstract class PythonLexerBase : Lexer
-{
- // A stack that keeps track of the indentation lengths
- private Stack indentLengthStack = new();
- // A list where tokens are waiting to be loaded into the token stream
- private LinkedList pendingTokens = new();
-
- // last pending token type
- private int previousPendingTokenType;
- private int lastPendingTokenTypeFromDefaultChannel;
-
- // The amount of opened parentheses, square brackets, or curly braces
- private int opened;
- // The amount of opened parentheses and square brackets in the current lexer mode
- private Stack paren_or_bracket_openedStack = new();
- // A stack that stores expression(s) between braces in fstring
- private Stack braceExpressionStack = new();
- private string prevBraceExpression = "";
-
- // Instead of this._mode (_mode is not implemented in each ANTLR4 runtime)
- private int curLexerMode;
- // Instead of this._modeStack (_modeStack is not implemented in each ANTLR4 runtime)
- private Stack lexerModeStack = new();
-
- private bool wasSpaceIndentation;
- private bool wasTabIndentation;
- private bool wasIndentationMixedWithSpacesAndTabs;
-
- private IToken curToken = null!; // current (under processing) token
- private IToken ffgToken = null!; // following (look ahead) token
-
- private const int INVALID_LENGTH = -1;
- private const string ERR_TXT = " ERROR: ";
-
- protected PythonLexerBase(ICharStream input) : base(input)
- {
- }
-
- protected PythonLexerBase(ICharStream input, TextWriter output, TextWriter errorOutput) : base(input, output, errorOutput)
- {
- }
-
- public override IToken NextToken() // reading the intStream stream until a return EOF
- {
- this.CheckNextToken();
- IToken firstPendingToken = this.pendingTokens.First!.Value;
- this.pendingTokens.RemoveFirst();
- return firstPendingToken; // add the queued token to the token stream
- }
-
- public override void Reset()
- {
- this.Init();
- base.Reset();
- }
-
- private void Init()
- {
- this.indentLengthStack = new();
- this.pendingTokens = new();
- this.previousPendingTokenType = 0;
- this.lastPendingTokenTypeFromDefaultChannel = 0;
- this.opened = 0;
- this.paren_or_bracket_openedStack = new();
- this.braceExpressionStack = new();
- this.prevBraceExpression = "";
- this.curLexerMode = 0;
- this.lexerModeStack = new();
- this.wasSpaceIndentation = false;
- this.wasTabIndentation = false;
- this.wasIndentationMixedWithSpacesAndTabs = false;
- this.curToken = null!;
- this.ffgToken = null!;
- }
-
- private void CheckNextToken()
- {
- if (this.previousPendingTokenType == TokenConstants.EOF)
- return;
-
- if (this.indentLengthStack.Count == 0) // We're at the first token
- {
- this.InsertENCODINGtoken();
- this.SetCurrentAndFollowingTokens();
- this.HandleStartOfInput();
- }
- else
- {
- this.SetCurrentAndFollowingTokens();
- }
-
-
- switch (this.curToken.Type)
- {
- case PythonLexer.NEWLINE:
- this.HandleNEWLINEtoken();
- break;
- case PythonLexer.LPAR:
- case PythonLexer.LSQB:
- case PythonLexer.LBRACE:
- this.opened++;
- this.AddPendingToken(this.curToken);
- break;
- case PythonLexer.RPAR:
- case PythonLexer.RSQB:
- case PythonLexer.RBRACE:
- this.opened--;
- this.AddPendingToken(this.curToken);
- break;
- case PythonLexer.FSTRING_MIDDLE:
- this.HandleFSTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field
- this.AddPendingToken(this.curToken);
- break;
- case PythonLexer.COLONEQUAL:
- this.HandleCOLONEQUALtokenInFString();
- break;
- case PythonLexer.ERRORTOKEN:
- this.ReportLexerError("token recognition error at: '" + this.curToken.Text + "'");
- this.AddPendingToken(this.curToken);
- break;
- case TokenConstants.EOF:
- this.HandleEOFtoken();
- break;
- default:
- this.AddPendingToken(this.curToken);
- break;
- }
- this.HandleFORMAT_SPECIFICATION_MODE();
- }
-
- private void SetCurrentAndFollowingTokens()
- {
- this.curToken = this.ffgToken == null ?
- base.NextToken() :
- this.ffgToken;
-
- this.CheckCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)!
-
- this.ffgToken = this.curToken.Type == TokenConstants.EOF ?
- this.curToken :
- base.NextToken();
- }
-
- private void InsertENCODINGtoken() // https://peps.python.org/pep-0263/
- {
- var lineBuilder = new StringBuilder();
- var encodingName = "";
- var lineCount = 0;
- var ws_commentPattern = new Regex("^[ \t\f]*(#.*)?$");
- var intStream = this.InputStream;
- var size = intStream.Size;
-
- intStream.Seek(0);
- for (int i = 0; i < size; i++)
- {
- char c = (char)intStream.LA(i + 1);
- lineBuilder.Append(c);
-
- if (c == '\n' || i == size - 1)
- {
- string line = lineBuilder.ToString().Replace("\r", "").Replace("\n", "");
- if (ws_commentPattern.IsMatch(line)) // WS* + COMMENT? found
- {
- encodingName = GetEncodingName(line);
- if (encodingName != "")
- {
- break; // encoding found
- }
- }
- else
- {
- break; // statement or backslash found (line is not empty, not whitespace(s), not comment)
- }
-
- lineCount++;
- if (lineCount >= 2)
- {
- break; // check only the first two lines
- }
- lineBuilder.Clear();
- }
- }
-
- if (encodingName == "")
- {
- encodingName = "utf-8"; // default Python source code encoding
- }
-
- var encodingToken = new CommonToken(PythonLexer.ENCODING, encodingName);
- encodingToken.Channel = TokenConstants.HiddenChannel;
- encodingToken.StartIndex = 0;
- encodingToken.StopIndex = 0;
- encodingToken.Line = 0;
- encodingToken.Column = -1;
- AddPendingToken(encodingToken);
- }
-
- private static string GetEncodingName(string commentText) // https://peps.python.org/pep-0263/#defining-the-encoding
- {
- var encodingCommentPattern = new Regex("^[ \\t\\f]*#.*?coding[:=][ \\t]*([-_.a-zA-Z0-9]+)");
- var match = encodingCommentPattern.Match(commentText);
- return match.Success ? match.Groups[1].Value : string.Empty;
- }
-
- // initialize the _indentLengths
- // hide the leading NEWLINE token(s)
- // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
- // insert a leading INDENT token if necessary
- private void HandleStartOfInput()
- {
- // initialize the stack with a default 0 indentation length
- this.indentLengthStack.Push(0); // this will never be popped off
- while (this.curToken.Type != TokenConstants.EOF)
- {
- if (this.curToken.Channel == TokenConstants.DefaultChannel)
- {
- if (this.curToken.Type == PythonLexer.NEWLINE)
- {
- // all the NEWLINE tokens must be ignored before the first statement
- this.HideAndAddPendingToken(this.curToken);
- }
- else
- { // We're at the first statement
- this.InsertLeadingIndentToken();
- return; // continue the processing of the current token with CheckNextToken()
- }
- }
- else
- {
- this.AddPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING, or COMMENT token
- }
- this.SetCurrentAndFollowingTokens();
- } // continue the processing of the EOF token with CheckNextToken()
- }
-
- private void InsertLeadingIndentToken()
- {
- if (this.previousPendingTokenType == PythonLexer.WS)
- {
- IToken prevToken = this.pendingTokens.Last!.Value;
- if (this.GetIndentationLength(prevToken.Text) != 0) // there is an "indentation" before the first statement
- {
- const string errMsg = "first statement indented";
- this.ReportLexerError(errMsg);
- // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
- this.CreateAndAddPendingToken(PythonLexer.INDENT, TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, this.curToken);
- }
- }
- }
-
- private void HandleNEWLINEtoken()
- {
- if (this.lexerModeStack.Count > 0)
- {
- this.AddPendingToken(this.curToken);
- }
- else if (this.opened > 0)
- {
- // We're in an implicit line joining, ignore the current NEWLINE token
- this.HideAndAddPendingToken(this.curToken);
- }
- else
- {
- IToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
- bool isLookingAhead = this.ffgToken.Type == PythonLexer.WS;
- if (isLookingAhead)
- {
- this.SetCurrentAndFollowingTokens(); // set the next two tokens
- }
-
- switch (this.ffgToken.Type)
- {
- case PythonLexer.NEWLINE: // We're before a blank line
- case PythonLexer.COMMENT: // We're before a comment
- this.HideAndAddPendingToken(nlToken);
- if (isLookingAhead)
- {
- this.AddPendingToken(this.curToken); // WS token
- }
- break;
- default:
- this.AddPendingToken(nlToken);
- if (isLookingAhead)
- { // We're on a whitespace(s) followed by a statement
- int indentationLength = this.ffgToken.Type == TokenConstants.EOF ?
- 0 :
- this.GetIndentationLength(this.curToken.Text);
-
- if (indentationLength != PythonLexerBase.INVALID_LENGTH)
- {
- this.AddPendingToken(this.curToken); // WS token
- this.InsertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
- }
- else
- {
- this.ReportError("inconsistent use of tabs and spaces in indentation");
- }
- }
- else
- {
- // We're at a newline followed by a statement (there is no whitespace before the statement)
- this.InsertIndentOrDedentToken(0); // may insert DEDENT token(s)
- }
- break;
- }
- }
- }
-
- private void InsertIndentOrDedentToken(int indentLength)
- {
- int prevIndentLength = this.indentLengthStack.Peek();
- if (indentLength > prevIndentLength)
- {
- this.CreateAndAddPendingToken(PythonLexer.INDENT, TokenConstants.DefaultChannel, null, this.ffgToken);
- this.indentLengthStack.Push(indentLength);
- }
- else
- {
- while (indentLength < prevIndentLength)
- { // more than 1 DEDENT token may be inserted into the token stream
- this.indentLengthStack.Pop();
- prevIndentLength = this.indentLengthStack.Peek();
- if (indentLength <= prevIndentLength)
- {
- this.CreateAndAddPendingToken(PythonLexer.DEDENT, TokenConstants.DefaultChannel, null, this.ffgToken);
- }
- else
- {
- this.ReportError("inconsistent dedent");
- }
- }
- }
- }
-
- private void CheckCurToken()
- {
- switch (this.curToken.Type)
- {
- case PythonLexer.FSTRING_START:
- this.SetLexerModeByFSTRING_STARTtoken();
- return;
- case PythonLexer.FSTRING_MIDDLE:
- this.HandleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field
- if (this.curToken.Type == PythonLexer.FSTRING_MIDDLE)
- return; // No curToken exchange happened
- break;
- case PythonLexer.FSTRING_END:
- this.PopLexerMode();
- return;
- default:
- if (this.lexerModeStack.Count == 0)
- return; // Not in fstring mode
- break;
- }
-
- switch (this.curToken.Type)
- {
- case PythonLexer.NEWLINE:
- // append the current brace expression with the current newline
- this.AppendToBraceExpression(this.curToken.Text);
- var ctkn = new CommonToken(this.curToken);
- ctkn.Channel = TokenConstants.HiddenChannel;
- this.curToken = ctkn;
- break;
- case PythonLexer.LBRACE:
- // the outermost brace expression cannot be a dictionary comprehension or a set comprehension
- this.braceExpressionStack.Push("{");
- this.paren_or_bracket_openedStack.Push(0);
- this.PushLexerMode(Lexer.DEFAULT_MODE);
- break;
- case PythonLexer.LPAR:
- case PythonLexer.LSQB:
- // append the current brace expression with a "(" or a "["
- this.AppendToBraceExpression(this.curToken.Text);
- // https://peps.python.org/pep-0498/#lambdas-inside-expressions
- this.IncrementBraceStack();
- break;
- case PythonLexer.RPAR:
- case PythonLexer.RSQB:
- // append the current brace expression with a ")" or a "]"
- this.AppendToBraceExpression(this.curToken.Text);
- this.DecrementBraceStack();
- break;
- case PythonLexer.COLON:
- case PythonLexer.COLONEQUAL:
- // append the current brace expression with a ":" or a ":="
- this.AppendToBraceExpression(this.curToken.Text);
- this.SetLexerModeByCOLONorCOLONEQUALtoken();
- break;
- case PythonLexer.RBRACE:
- this.SetLexerModeAfterRBRACEtoken();
- break;
- default:
- // append the current brace expression with the current token text
- this.AppendToBraceExpression(this.curToken.Text);
- break;
- }
- }
-
- private void AppendToBraceExpression(string text)
- {
- this.braceExpressionStack.Push(this.braceExpressionStack.Pop() + text);
- }
-
- private void IncrementBraceStack()
- { // increment the last element (peek() + 1)
- this.paren_or_bracket_openedStack.Push(this.paren_or_bracket_openedStack.Pop() + 1);
- }
-
- private void DecrementBraceStack()
- { // decrement the last element (peek() - 1)
- this.paren_or_bracket_openedStack.Push(this.paren_or_bracket_openedStack.Pop() - 1);
- }
-
- private void SetLexerModeAfterRBRACEtoken()
- {
- switch (this.curLexerMode)
- {
- case Lexer.DEFAULT_MODE:
- this.PopLexerMode();
- this.PopByBRACE();
- break;
- case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE:
- this.PopLexerMode();
- this.PopLexerMode();
- this.PopByBRACE();
- break;
- default:
- this.ReportLexerError("f-string: single '}' is not allowed");
- break;
- }
- }
-
- private void SetLexerModeByFSTRING_STARTtoken()
- {
- string text = this.curToken.Text.ToLower();
- var modeMap = new Dictionary
- {
- { "f'", PythonLexer.SQ1__FSTRING_MODE },
- { "rf'", PythonLexer.SQ1R_FSTRING_MODE },
- { "fr'", PythonLexer.SQ1R_FSTRING_MODE },
- { "f\"", PythonLexer.DQ1__FSTRING_MODE },
- { "rf\"", PythonLexer.DQ1R_FSTRING_MODE },
- { "fr\"", PythonLexer.DQ1R_FSTRING_MODE },
- { "f'''", PythonLexer.SQ3__FSTRING_MODE },
- { "rf'''", PythonLexer.SQ3R_FSTRING_MODE },
- { "fr'''", PythonLexer.SQ3R_FSTRING_MODE },
- { "f\"\"\"", PythonLexer.DQ3__FSTRING_MODE },
- { "rf\"\"\"", PythonLexer.DQ3R_FSTRING_MODE },
- { "fr\"\"\"", PythonLexer.DQ3R_FSTRING_MODE }
- };
-
- if (modeMap.TryGetValue(text, out int mode))
- {
- this.PushLexerMode(mode);
- }
- }
-
- private void SetLexerModeByCOLONorCOLONEQUALtoken()
- {
- if (this.paren_or_bracket_openedStack.Peek() == 0)
- {
- // COLONEQUAL token will be replaced with a COLON token in CheckNextToken()
- switch (this.lexerModeStack.Peek())
- { // check the previous lexer mode (the current is DEFAULT_MODE)
- case PythonLexer.SQ1__FSTRING_MODE:
- case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE:
- this.PushLexerMode(PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ1R_FSTRING_MODE:
- case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE:
- this.PushLexerMode(PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ1__FSTRING_MODE:
- case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE:
- this.PushLexerMode(PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ1R_FSTRING_MODE:
- case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE:
- this.PushLexerMode(PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ3__FSTRING_MODE:
- case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE:
- this.PushLexerMode(PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ3R_FSTRING_MODE:
- case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE:
- this.PushLexerMode(PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ3__FSTRING_MODE:
- case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE:
- this.PushLexerMode(PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ3R_FSTRING_MODE:
- case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE:
- this.PushLexerMode(PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- }
- }
- }
-
- private void PopByBRACE()
- {
- this.paren_or_bracket_openedStack.Pop();
- this.prevBraceExpression = this.braceExpressionStack.Pop() + "}";
- if (this.braceExpressionStack.Count > 0)
- {
- // append the current brace expression with the previous brace expression
- this.braceExpressionStack.Push(this.braceExpressionStack.Pop() + this.prevBraceExpression);
- }
-
- }
-
- private void HandleFSTRING_MIDDLEtokenWithDoubleBrace()
- {
- // replace the trailing double brace with a single brace and insert a hidden brace token
- switch (this.GetLastTwoCharsOfTheCurTokenText())
- {
- case "{{":
- this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", TokenConstants.HiddenChannel);
- break;
- case "}}":
- this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", TokenConstants.HiddenChannel);
- break;
- }
- }
-
- private void HandleFSTRING_MIDDLEtokenWithQuoteAndLBrace()
- {
- // replace the trailing quote + left_brace with a quote and insert an LBRACE token
- // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token
- switch (this.GetLastTwoCharsOfTheCurTokenText())
- {
- case "\"{":
- case "'{":
- case "\\{":
- this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", TokenConstants.DefaultChannel);
- break;
- }
- }
-
- private string GetLastTwoCharsOfTheCurTokenText()
- {
- string curTokenText = this.curToken.Text;
- return curTokenText.Length >= 2 ? curTokenText.Substring(curTokenText.Length - 2) : curTokenText;
- }
-
- private void TrimLastCharAddPendingTokenSetCurToken(int type, string text, int channel)
- {
- // trim the last char and add the modified curToken to the pendingTokens stack
- string curTokenText = this.curToken.Text;
- string tokenTextWithoutLastChar = curTokenText.Substring(0, curTokenText.Length - 1);
- var ctkn = new CommonToken(this.curToken);
- ctkn.Text = tokenTextWithoutLastChar;
- ctkn.StopIndex = ctkn.StopIndex - 1;
- this.AddPendingToken(ctkn);
-
- this.CreateNewCurToken(type, text, channel); // set curToken
- }
-
- private void HandleCOLONEQUALtokenInFString()
- {
- if (this.lexerModeStack.Count > 0 &&
- this.paren_or_bracket_openedStack.Peek() == 0)
- {
- // In fstring a colonequal (walrus operator) can only be used in parentheses
- // Not in parentheses, replace COLONEQUAL token with COLON as format specifier
- // and insert the equal symbol to the following FSTRING_MIDDLE token
- var ctkn = new CommonToken(this.curToken);
- ctkn.Type = PythonLexer.COLON;
- ctkn.Text = ":";
- ctkn.StopIndex = ctkn.StartIndex;
- this.curToken = ctkn;
- if (this.ffgToken.Type == PythonLexer.FSTRING_MIDDLE)
- {
- ctkn = new CommonToken(this.ffgToken);
- ctkn.Text = "=" + ctkn.Text;
- ctkn.StartIndex -= 1;
- ctkn.Column -= 1;
- this.ffgToken = ctkn;
- }
- else
- {
- this.AddPendingToken(this.curToken);
- this.CreateNewCurToken(PythonLexer.FSTRING_MIDDLE, "=", TokenConstants.DefaultChannel);
- }
- }
- this.AddPendingToken(this.curToken);
- }
-
- private void CreateNewCurToken(int type, string text, int channel)
- {
- var ctkn = new CommonToken(this.curToken);
- ctkn.Type = type;
- ctkn.Text = text;
- ctkn.Channel = channel;
- ctkn.Column += 1;
- ctkn.StartIndex += 1;
- ctkn.StopIndex = ctkn.StartIndex;
- this.curToken = ctkn;
- }
-
- private void PushLexerMode(int mode)
- {
- this.PushMode(mode);
- this.lexerModeStack.Push(this.curLexerMode);
- this.curLexerMode = mode;
- }
-
- private void PopLexerMode()
- {
- this.PopMode();
- this.curLexerMode = this.lexerModeStack.Pop();
- }
-
- private void HandleFORMAT_SPECIFICATION_MODE()
- {
- if (this.lexerModeStack.Count > 0
- && this.ffgToken.Type == PythonLexer.RBRACE)
- {
- // insert an empty FSTRING_MIDDLE token instead of the missing format specification
- switch (this.curToken.Type)
- {
- case PythonLexer.COLON:
- this.CreateAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, TokenConstants.DefaultChannel, "", this.ffgToken);
- break;
- case PythonLexer.RBRACE:
- // only if the previous brace expression is not a dictionary comprehension or set comprehension
- if (!IsDictionaryComprehensionOrSetComprehension(this.prevBraceExpression))
- {
- this.CreateAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, TokenConstants.DefaultChannel, "", this.ffgToken);
- }
- break;
- }
- }
- }
-
- private static bool IsDictionaryComprehensionOrSetComprehension(string code)
- {
- var inputStream = CharStreams.fromString(code);
- var lexer = new PythonLexer(inputStream);
- var tokenStream = new CommonTokenStream(lexer);
- var parser = new PythonParser(tokenStream);
-
- // Disable error listeners to suppress console output
- lexer.RemoveErrorListeners();
- parser.RemoveErrorListeners();
-
- parser.dictcomp(); // Try parsing as dictionary comprehension
- if (parser.NumberOfSyntaxErrors == 0)
- return true;
-
- parser = new PythonParser(tokenStream);
- tokenStream.Seek(0);
- parser.RemoveErrorListeners();
- parser.setcomp(); // Try parsing as set comprehension
- return parser.NumberOfSyntaxErrors == 0;
- }
-
- private void InsertTrailingTokens()
- {
- switch (this.lastPendingTokenTypeFromDefaultChannel)
- {
- case PythonLexer.NEWLINE:
- case PythonLexer.DEDENT:
- break; // no trailing NEWLINE token is needed
- default:
- // insert an extra trailing NEWLINE token that serves as the end of the last statement
- this.CreateAndAddPendingToken(PythonLexer.NEWLINE, TokenConstants.DefaultChannel, null, this.ffgToken); // ffgToken is EOF
- break;
- }
- this.InsertIndentOrDedentToken(0); // Now insert as many trailing DEDENT tokens as needed
- }
-
- private void HandleEOFtoken()
- {
- if (this.lastPendingTokenTypeFromDefaultChannel > 0)
- { // there was a statement in the intStream (leading NEWLINE tokens are hidden)
- this.InsertTrailingTokens();
- }
- this.AddPendingToken(this.curToken);
- }
-
- private void HideAndAddPendingToken(IToken tkn)
- {
- var ctkn = new CommonToken(tkn);
- ctkn.Channel = TokenConstants.HiddenChannel;
- this.AddPendingToken(ctkn);
- }
-
- private void CreateAndAddPendingToken(int ttype, int channel, string? text, IToken sampleToken)
- {
- var ctkn = new CommonToken(sampleToken);
- ctkn.Type = ttype;
- ctkn.Channel = channel;
- ctkn.StopIndex = sampleToken.StartIndex - 1;
- ctkn.Text = text ?? "<" + this.Vocabulary.GetSymbolicName(ttype) + ">";
-
- this.AddPendingToken(ctkn);
- }
-
- private void AddPendingToken(IToken tkn)
- {
- // save the last pending token type because the pendingTokens list can be empty by the nextToken()
- this.previousPendingTokenType = tkn.Type;
- if (tkn.Channel == TokenConstants.DefaultChannel)
- {
- this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
- }
- this.pendingTokens.AddLast(tkn);
- }
-
- private int GetIndentationLength(string indentText) // the indentText may contain spaces, tabs or form feeds
- {
- const int TAB_LENGTH = 8; // the standard number of spaces to replace a tab with spaces
- int length = 0;
- foreach (char ch in indentText)
- {
- switch (ch)
- {
- case ' ':
- this.wasSpaceIndentation = true;
- length += 1;
- break;
- case '\t':
- this.wasTabIndentation = true;
- length += TAB_LENGTH - (length % TAB_LENGTH);
- break;
- case '\f': // form feed
- length = 0;
- break;
- }
- }
-
- if (this.wasTabIndentation && this.wasSpaceIndentation)
- {
- if (!this.wasIndentationMixedWithSpacesAndTabs)
- {
- this.wasIndentationMixedWithSpacesAndTabs = true;
- length = PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent
- }
- }
- return length;
- }
-
- private void ReportLexerError(string errMsg)
- {
- this.ErrorListenerDispatch.SyntaxError(this.ErrorOutput, this, this.curToken.Type, this.curToken.Line, this.curToken.Column, " LEXER" + PythonLexerBase.ERR_TXT + errMsg, null);
- }
-
- private void ReportError(string errMsg)
- {
- this.ReportLexerError(errMsg);
-
- // the ERRORTOKEN will raise an error in the parser
- this.CreateAndAddPendingToken(PythonLexer.ERRORTOKEN, TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken);
- }
-}
diff --git a/python/python3_13/Java/PythonLexerBase.java b/python/python3_13/Java/PythonLexerBase.java
deleted file mode 100644
index ab5eb88751..0000000000
--- a/python/python3_13/Java/PythonLexerBase.java
+++ /dev/null
@@ -1,684 +0,0 @@
-/*
-The MIT License (MIT)
-Copyright (c) 2021 Robert Einhorn
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
- */
-
-/*
- *
- * Project : Python Indent/Dedent handler for ANTLR4 grammars
- *
- * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com
- *
- */
-
-import java.util.*;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import org.antlr.v4.runtime.*;
-
-public abstract class PythonLexerBase extends Lexer {
- // A stack that keeps track of the indentation lengths
- private Deque indentLengthStack;
- // A list where tokens are waiting to be loaded into the token stream
- private Deque pendingTokens;
-
- // last pending token type
- private int previousPendingTokenType;
- private int lastPendingTokenTypeFromDefaultChannel;
-
- // The amount of opened parentheses, square brackets or curly braces
- private int opened;
- // The amount of opened parentheses and square brackets in the current lexer mode
- private Deque paren_or_bracket_openedStack;
- // A stack that stores expression(s) between braces in fstring
- private Deque braceExpressionStack;
- private String prevBraceExpression;
-
- // Instead of this._mode (_mode is not implemented in each ANTLR4 runtime)
- private int curLexerMode;
- // Instead of this._modeStack (_modeStack is not implemented in each ANTLR4 runtime)
- private Deque lexerModeStack;
-
- private boolean wasSpaceIndentation;
- private boolean wasTabIndentation;
- private boolean wasIndentationMixedWithSpacesAndTabs;
-
- private Token curToken; // current (under processing) token
- private Token ffgToken; // following (look ahead) token
-
- private final int INVALID_LENGTH = -1;
- private final String ERR_TXT = " ERROR: ";
-
- protected PythonLexerBase(CharStream input) {
- super(input);
- this.init();
- }
-
- @Override
- public Token nextToken() { // reading the input stream until a return EOF
- this.checkNextToken();
- return this.pendingTokens.pollFirst(); // add the queued token to the token stream
- }
-
- @Override
- public void reset() {
- this.init();
- super.reset();
- }
-
- private void init() {
- this.indentLengthStack = new ArrayDeque<>();
- this.pendingTokens = new ArrayDeque<>();
- this.previousPendingTokenType = 0;
- this.lastPendingTokenTypeFromDefaultChannel = 0;
- this.opened = 0;
- this.paren_or_bracket_openedStack = new ArrayDeque<>();
- this.braceExpressionStack = new ArrayDeque<>();
- this.prevBraceExpression = "";
- this.curLexerMode = 0;
- this.lexerModeStack = new ArrayDeque<>();
- this.wasSpaceIndentation = false;
- this.wasTabIndentation = false;
- this.wasIndentationMixedWithSpacesAndTabs = false;
- this.curToken = null;
- this.ffgToken = null;
- }
-
- private void checkNextToken() {
- if (this.previousPendingTokenType == Token.EOF)
- return;
-
- if (this.indentLengthStack.isEmpty()) { // We're at the first token
- this.insertENCODINGtoken();
- this.setCurrentAndFollowingTokens();
- this.handleStartOfInput();
- } else {
- this.setCurrentAndFollowingTokens();
- }
-
- switch (this.curToken.getType()) {
- case PythonLexer.NEWLINE:
- this.handleNEWLINEtoken();
- break;
- case PythonLexer.LPAR:
- case PythonLexer.LSQB:
- case PythonLexer.LBRACE:
- this.opened++;
- this.addPendingToken(this.curToken);
- break;
- case PythonLexer.RPAR:
- case PythonLexer.RSQB:
- case PythonLexer.RBRACE:
- this.opened--;
- this.addPendingToken(this.curToken);
- break;
- case PythonLexer.FSTRING_MIDDLE:
- this.handleFSTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field
- this.addPendingToken(this.curToken);
- break;
- case PythonLexer.COLONEQUAL:
- this.handleCOLONEQUALtokenInFString();
- break;
- case PythonLexer.ERRORTOKEN:
- this.reportLexerError("token recognition error at: '" + this.curToken.getText() + "'");
- this.addPendingToken(this.curToken);
- break;
- case Token.EOF:
- this.handleEOFtoken();
- break;
- default:
- this.addPendingToken(this.curToken);
- }
- this.handleFORMAT_SPECIFICATION_MODE();
- }
-
- private void setCurrentAndFollowingTokens() {
- this.curToken = this.ffgToken == null ?
- super.nextToken() :
- this.ffgToken;
-
- this.checkCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)!
-
- this.ffgToken = this.curToken.getType() == Token.EOF ?
- this.curToken :
- super.nextToken();
- }
-
- private void insertENCODINGtoken() { // https://peps.python.org/pep-0263/
- StringBuilder lineBuilder = new StringBuilder();
- String encodingName = "";
- int lineCount = 0;
- final Pattern ws_commentPattern = Pattern.compile("^[ \\t\\f]*(#.*)?$");
- final CharStream charStream = this.getInputStream();
- final int size = charStream.size();
-
- charStream.seek(0);
- for (int i = 0; i < size; i++) {
- char c = (char) charStream.LA(i + 1);
- lineBuilder.append(c);
-
- if (c == '\n' || i == size - 1) {
- String line = lineBuilder.toString().replace("\r", "").replace("\n", "");
- if (ws_commentPattern.matcher(line).find()) { // WS* + COMMENT? found
- encodingName = getEncodingName(line);
- if (!encodingName.isEmpty()) {
- break; // encoding found
- }
- } else {
- break; // statement or backslash found (line is not empty, not whitespace(s), not comment)
- }
-
- lineCount++;
- if (lineCount >= 2) {
- break; // check only the first two lines
- }
- lineBuilder = new StringBuilder();
- }
- }
-
- if (encodingName.isEmpty()) {
- encodingName = "utf-8"; // default Python source code encoding
- }
-
- final CommonToken encodingToken = new CommonToken(PythonLexer.ENCODING, encodingName);
- encodingToken.setChannel(Token.HIDDEN_CHANNEL);
- this.addPendingToken(encodingToken);
- }
-
- private String getEncodingName(final String commentText) { // https://peps.python.org/pep-0263/#defining-the-encoding
- final Pattern encodingCommentPattern = Pattern.compile("^[ \\t\\f]*#.*?coding[:=][ \\t]*([-_.a-zA-Z0-9]+)");
- final Matcher matcher = encodingCommentPattern.matcher(commentText);
- return matcher.find() ? matcher.group(1) : "";
- }
-
- // initialize the indentLengthStack
- // hide the leading NEWLINE token(s)
- // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
- // insert a leading INDENT token if necessary
- private void handleStartOfInput() {
- // initialize the stack with a default 0 indentation length
- this.indentLengthStack.push(0); // this will never be popped off
- while (this.curToken.getType() != Token.EOF) {
- if (this.curToken.getChannel() == Token.DEFAULT_CHANNEL) {
- if (this.curToken.getType() == PythonLexer.NEWLINE) {
- // all the NEWLINE tokens must be ignored before the first statement
- this.hideAndAddPendingToken(this.curToken);
- } else { // We're at the first statement
- this.insertLeadingIndentToken();
- return; // continue the processing of the current token with checkNextToken()
- }
- } else {
- this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
- }
- this.setCurrentAndFollowingTokens();
- }
- // continue the processing of the EOF token with checkNextToken()
- }
-
- private void insertLeadingIndentToken() {
- if (this.previousPendingTokenType == PythonLexer.WS) {
- Token prevToken = this.pendingTokens.peekLast(); // WS token
- if (this.getIndentationLength(prevToken.getText()) != 0) { // there is an "indentation" before the first statement
- final String errMsg = "first statement indented";
- this.reportLexerError(errMsg);
- // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
- this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken);
- }
- }
- }
-
- private void handleNEWLINEtoken() {
- if (!this.lexerModeStack.isEmpty()) { // for multi line fstring literals
- this.addPendingToken(this.curToken);
- } else if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
- this.hideAndAddPendingToken(this.curToken);
- } else {
- final Token nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
- final boolean isLookingAhead = this.ffgToken.getType() == PythonLexer.WS;
- if (isLookingAhead) {
- this.setCurrentAndFollowingTokens(); // set the next two tokens
- }
-
- switch (this.ffgToken.getType()) {
- case PythonLexer.NEWLINE: // We're before a blank line
- case PythonLexer.COMMENT: // We're before a comment
- this.hideAndAddPendingToken(nlToken);
- if (isLookingAhead) {
- this.addPendingToken(this.curToken); // WS token
- }
- break;
- default:
- this.addPendingToken(nlToken);
- if (isLookingAhead) { // We're on a whitespace(s) followed by a statement
- final int indentationLength = this.ffgToken.getType() == Token.EOF ?
- 0 :
- this.getIndentationLength(this.curToken.getText());
-
- if (indentationLength != this.INVALID_LENGTH) {
- this.addPendingToken(this.curToken); // WS token
- this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
- } else {
- this.reportError("inconsistent use of tabs and spaces in indentation");
- }
- } else { // We're at a newline followed by a statement (there is no whitespace before the statement)
- this.insertIndentOrDedentToken(0); // may insert DEDENT token(s)
- }
- }
- }
- }
-
- private void insertIndentOrDedentToken(final int indentLength) {
- int prevIndentLength = this.indentLengthStack.peek();
- if (indentLength > prevIndentLength) {
- this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
- this.indentLengthStack.push(indentLength);
- } else {
- while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
- this.indentLengthStack.pop();
- prevIndentLength = this.indentLengthStack.peek();
- if (indentLength <= prevIndentLength) {
- this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
- } else {
- this.reportError("inconsistent dedent");
- }
- }
- }
- }
-
- private void checkCurToken() {
- switch (this.curToken.getType()) {
- case PythonLexer.FSTRING_START:
- this.setLexerModeByFSTRING_STARTtoken();
- return;
- case PythonLexer.FSTRING_MIDDLE:
- this.handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field
- if (this.curToken.getType() == PythonLexer.FSTRING_MIDDLE)
- return; // No curToken exchange happened
- break;
- case PythonLexer.FSTRING_END:
- this.popLexerMode();
- return;
- default:
- if (this.lexerModeStack.isEmpty())
- return; // Not in fstring mode
- }
-
- switch (this.curToken.getType()) { // the following tokens can only come from default mode (after an LBRACE in fstring)
- case PythonLexer.NEWLINE:
- // append the current brace expression with the current newline
- this.appendToBraceExpression(this.curToken.getText());
- final CommonToken ctkn = new CommonToken(this.curToken);
- ctkn.setChannel(Token.HIDDEN_CHANNEL);
- this.curToken = ctkn;
- break;
- case PythonLexer.LBRACE:
- // the outermost brace expression cannot be a dictionary comprehension or a set comprehension
- this.braceExpressionStack.push("{");
- this.paren_or_bracket_openedStack.push(0);
- this.pushLexerMode(Lexer.DEFAULT_MODE);
- break;
- case PythonLexer.LPAR:
- case PythonLexer.LSQB:
- // append the current brace expression with a "(" or a "["
- this.appendToBraceExpression(this.curToken.getText());
- // https://peps.python.org/pep-0498/#lambdas-inside-expressions
- this.incrementBraceStack();
- break;
- case PythonLexer.RPAR:
- case PythonLexer.RSQB:
- // append the current brace expression with a ")" or a "]"
- this.appendToBraceExpression(this.curToken.getText());
- this.decrementBraceStack();
- break;
- case PythonLexer.COLON:
- case PythonLexer.COLONEQUAL:
- // append the current brace expression with a ":" or a ":="
- this.appendToBraceExpression(this.curToken.getText());
- this.setLexerModeByCOLONorCOLONEQUALtoken();
- break;
- case PythonLexer.RBRACE:
- this.setLexerModeAfterRBRACEtoken();
- break;
- default:
- // append the current brace expression with the current token text
- this.appendToBraceExpression(this.curToken.getText());
- }
- }
-
- private void appendToBraceExpression(String text) {
- this.braceExpressionStack.push(this.braceExpressionStack.pop() + text);
- }
-
- private void incrementBraceStack() { // increment the last element (peek() + 1)
- this.paren_or_bracket_openedStack.push(this.paren_or_bracket_openedStack.pop() + 1);
- }
-
- private void decrementBraceStack() { // decrement the last element (peek() - 1)
- this.paren_or_bracket_openedStack.push(this.paren_or_bracket_openedStack.pop() - 1);
- }
-
- private void setLexerModeAfterRBRACEtoken() {
- switch (this.curLexerMode) {
- case Lexer.DEFAULT_MODE:
- this.popLexerMode();
- this.popByBRACE();
- break;
- case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE:
- this.popLexerMode();
- this.popLexerMode();
- this.popByBRACE();
- break;
- default:
- this.reportLexerError("f-string: single '}' is not allowed");
- }
- }
-
- private void setLexerModeByFSTRING_STARTtoken() {
- final String text = this.curToken.getText().toLowerCase();
- Map modeMap = new HashMap<>();
- modeMap.put("f'", PythonLexer.SQ1__FSTRING_MODE);
- modeMap.put("rf'", PythonLexer.SQ1R_FSTRING_MODE);
- modeMap.put("fr'", PythonLexer.SQ1R_FSTRING_MODE);
- modeMap.put("f\"", PythonLexer.DQ1__FSTRING_MODE);
- modeMap.put("rf\"", PythonLexer.DQ1R_FSTRING_MODE);
- modeMap.put("fr\"", PythonLexer.DQ1R_FSTRING_MODE);
- modeMap.put("f'''", PythonLexer.SQ3__FSTRING_MODE);
- modeMap.put("rf'''", PythonLexer.SQ3R_FSTRING_MODE);
- modeMap.put("fr'''", PythonLexer.SQ3R_FSTRING_MODE);
- modeMap.put("f\"\"\"", PythonLexer.DQ3__FSTRING_MODE);
- modeMap.put("rf\"\"\"", PythonLexer.DQ3R_FSTRING_MODE);
- modeMap.put("fr\"\"\"", PythonLexer.DQ3R_FSTRING_MODE);
-
- Integer mode = modeMap.get(text);
- if (mode != null) {
- this.pushLexerMode(mode);
- }
- }
-
- private void setLexerModeByCOLONorCOLONEQUALtoken() {
- if (this.paren_or_bracket_openedStack.peek() == 0) {
- // COLONEQUAL token will be replaced with a COLON token in checkNextToken()
- switch (this.lexerModeStack.peek()) { // check the previous lexer mode (the current is DEFAULT_MODE)
- case PythonLexer.SQ1__FSTRING_MODE:
- case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ1R_FSTRING_MODE:
- case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ1__FSTRING_MODE:
- case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ1R_FSTRING_MODE:
- case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ3__FSTRING_MODE:
- case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ3R_FSTRING_MODE:
- case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ3__FSTRING_MODE:
- case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ3R_FSTRING_MODE:
- case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- }
- }
- }
-
- private void popByBRACE() {
- this.paren_or_bracket_openedStack.pop();
- this.prevBraceExpression = this.braceExpressionStack.pop() + "}";
- if (!this.braceExpressionStack.isEmpty()) {
- // append the current brace expression with the previous brace expression
- this.braceExpressionStack.push(this.braceExpressionStack.pop() + this.prevBraceExpression);
- }
-
- }
-
- private void handleFSTRING_MIDDLEtokenWithDoubleBrace() {
- // replace the trailing double brace with a single brace and insert a hidden brace token
- switch (this.getLastTwoCharsOfTheCurTokenText()) {
- case "{{":
- this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL);
- break;
- case "}}":
- this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL);
- break;
- }
- }
-
- private void handleFSTRING_MIDDLEtokenWithQuoteAndLBrace() {
- // replace the trailing quote + left_brace with a quote and insert an LBRACE token
- // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token
- switch (this.getLastTwoCharsOfTheCurTokenText()) {
- case "\"{":
- case "'{":
- case "\\{":
- this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL);
- break;
- }
- }
-
- private String getLastTwoCharsOfTheCurTokenText() {
- final String curTokenText = this.curToken.getText();
- return curTokenText.length() >= 2 ? curTokenText.substring(curTokenText.length() - 2) : curTokenText;
- }
-
- private void trimLastCharAddPendingTokenSetCurToken(final int type, final String text, final int channel) {
- // trim the last char and add the modified curToken to the pendingTokens stack
- final String curTokenText = this.curToken.getText();
- final String tokenTextWithoutLastChar = curTokenText.substring(0, curTokenText.length() - 1);
- final CommonToken ctkn = new CommonToken(this.curToken);
- ctkn.setText(tokenTextWithoutLastChar);
- ctkn.setStopIndex(ctkn.getStopIndex() - 1);
- this.addPendingToken(ctkn);
-
- this.createNewCurToken(type, text, channel); // set curToken
- }
-
- private void handleCOLONEQUALtokenInFString() {
- if (!this.lexerModeStack.isEmpty() &&
- this.paren_or_bracket_openedStack.peek() == 0) {
-
- // In fstring a colonequal (walrus operator) can only be used in parentheses
- // Not in parentheses, replace COLONEQUAL token with COLON as format specifier
- // and insert the equal symbol to the following FSTRING_MIDDLE token
- CommonToken ctkn = new CommonToken(this.curToken);
- ctkn.setType(PythonLexer.COLON);
- ctkn.setText(":");
- ctkn.setStopIndex(ctkn.getStartIndex());
- this.curToken = ctkn;
- if (this.ffgToken.getType() == PythonLexer.FSTRING_MIDDLE) {
- ctkn = new CommonToken(this.ffgToken);
- ctkn.setText("=" + ctkn.getText());
- ctkn.setStartIndex(ctkn.getStartIndex() - 1);
- ctkn.setCharPositionInLine(ctkn.getCharPositionInLine() - 1);
- this.ffgToken = ctkn;
- } else {
- this.addPendingToken(this.curToken);
- this.createNewCurToken(PythonLexer.FSTRING_MIDDLE, "=", Token.DEFAULT_CHANNEL);
- }
- }
- this.addPendingToken(this.curToken);
- }
-
- private void createNewCurToken(final int type, final String text, final int channel) {
- final CommonToken ctkn = new CommonToken(this.curToken);
- ctkn.setType(type);
- ctkn.setText(text);
- ctkn.setChannel(channel);
- ctkn.setCharPositionInLine(ctkn.getCharPositionInLine() + 1);
- ctkn.setStartIndex(ctkn.getStartIndex() + 1);
- ctkn.setStopIndex(ctkn.getStartIndex());
- this.curToken = ctkn;
- }
-
- private void pushLexerMode(final int mode) {
- this.pushMode(mode);
- this.lexerModeStack.push(this.curLexerMode);
- this.curLexerMode = mode;
- }
-
- private void popLexerMode() {
- this.popMode();
- this.curLexerMode = this.lexerModeStack.pop();
- }
-
- private void handleFORMAT_SPECIFICATION_MODE() {
- if (!this.lexerModeStack.isEmpty() &&
- this.ffgToken.getType() == PythonLexer.RBRACE) {
-
- // insert an empty FSTRING_MIDDLE token instead of the missing format specification
- switch (this.curToken.getType()) {
- case PythonLexer.COLON:
- this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken);
- break;
- case PythonLexer.RBRACE:
- // only if the previous brace expression is not a dictionary comprehension or set comprehension
- if (!isDictionaryComprehensionOrSetComprehension(this.prevBraceExpression)) {
- this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken);
- }
- break;
- }
- }
- }
-
- private boolean isDictionaryComprehensionOrSetComprehension(final String code) {
- final CharStream inputStream = CharStreams.fromString(code);
- final PythonLexer lexer = new PythonLexer(inputStream);
- final CommonTokenStream tokenStream = new CommonTokenStream(lexer);
- PythonParser parser = new PythonParser(tokenStream);
-
- // Disable error listeners to suppress console output
- lexer.removeErrorListeners();
- parser.removeErrorListeners();
-
- parser.dictcomp(); // Try parsing as dictionary comprehension
- if (parser.getNumberOfSyntaxErrors() == 0)
- return true;
-
- parser = new PythonParser(tokenStream);
- tokenStream.seek(0);
- parser.removeErrorListeners();
- parser.setcomp(); // Try parsing as set comprehension
- return parser.getNumberOfSyntaxErrors() == 0;
- }
-
- private void insertTrailingTokens() {
- switch (this.lastPendingTokenTypeFromDefaultChannel) {
- case PythonLexer.NEWLINE:
- case PythonLexer.DEDENT:
- break; // no trailing NEWLINE token is needed
- default: // insert an extra trailing NEWLINE token that serves as the end of the last statement
- this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // ffgToken is EOF
- }
- this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
- }
-
- private void handleEOFtoken() {
- if (this.lastPendingTokenTypeFromDefaultChannel > 0) {
- // there was a statement in the input (leading NEWLINE tokens are hidden)
- this.insertTrailingTokens();
- }
- this.addPendingToken(this.curToken);
- }
-
- private void hideAndAddPendingToken(final Token tkn) {
- final CommonToken ctkn = new CommonToken(tkn);
- ctkn.setChannel(Token.HIDDEN_CHANNEL);
- this.addPendingToken(ctkn);
- }
-
- private void createAndAddPendingToken(final int ttype, final int channel, final String text, final Token sampleToken) {
- final CommonToken ctkn = new CommonToken(sampleToken);
- ctkn.setType(ttype);
- ctkn.setChannel(channel);
- ctkn.setStopIndex(sampleToken.getStartIndex() - 1);
- ctkn.setText(text == null ?
- "<" + this.getVocabulary().getDisplayName(ttype) + ">" :
- text);
-
- this.addPendingToken(ctkn);
- }
-
- private void addPendingToken(final Token tkn) {
- // save the last pending token type because the pendingTokens list can be empty by the nextToken()
- this.previousPendingTokenType = tkn.getType();
- if (tkn.getChannel() == Token.DEFAULT_CHANNEL) {
- this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
- }
- this.pendingTokens.addLast(tkn);
- }
-
- private int getIndentationLength(final String indentText) { // the indentText may contain spaces, tabs or form feeds
- final int TAB_LENGTH = 8; // the standard number of spaces to replace a tab with spaces
- int length = 0;
- for (char ch : indentText.toCharArray()) {
- switch (ch) {
- case ' ':
- this.wasSpaceIndentation = true;
- length += 1;
- break;
- case '\t':
- this.wasTabIndentation = true;
- length += TAB_LENGTH - (length % TAB_LENGTH);
- break;
- case '\f': // form feed
- length = 0;
- break;
- }
- }
-
- if (this.wasTabIndentation && this.wasSpaceIndentation) {
- if (!(this.wasIndentationMixedWithSpacesAndTabs)) {
- this.wasIndentationMixedWithSpacesAndTabs = true;
- length = this.INVALID_LENGTH; // only for the first inconsistent indent
- }
- }
- return length;
- }
-
- private void reportLexerError(final String errMsg) {
- this.getErrorListenerDispatch().syntaxError(this, this.curToken, this.curToken.getLine(), this.curToken.getCharPositionInLine(), " LEXER" + this.ERR_TXT + errMsg, null);
- }
-
- private void reportError(final String errMsg) {
- this.reportLexerError(errMsg);
-
- // the ERRORTOKEN will raise an error in the parser
- this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken);
- }
-}
diff --git a/python/python3_13/JavaScript/PythonLexerBase.js b/python/python3_13/JavaScript/PythonLexerBase.js
deleted file mode 100644
index 5c08004f40..0000000000
--- a/python/python3_13/JavaScript/PythonLexerBase.js
+++ /dev/null
@@ -1,676 +0,0 @@
-/*
-The MIT License (MIT)
-Copyright (c) 2021 Robert Einhorn
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
- */
-
-/*
- *
- * Project : Python Indent/Dedent handler for ANTLR4 grammars
- *
- * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com
- *
- */
-
-import { CharStreams, CommonTokenStream, Token, CommonToken, Lexer } from "antlr4";
-import PythonLexer from "./PythonLexer.js";
-import PythonParser from "./PythonParser.js";
-
-export default class PythonLexerBase extends Lexer {
- constructor(input) {
- super(input);
-
- // A stack that keeps track of the indentation lengths
- this.indentLengthStack;
- // A list where tokens are waiting to be loaded into the token stream
- this.pendingTokens;
-
- // last pending token types
- this.previousPendingTokenType;
- this.lastPendingTokenTypeFromDefaultChannel;
-
- // The amount of opened parentheses, square brackets or curly braces
- this.opened;
- // The amount of opened parentheses and square brackets in the current lexer mode
- this.paren_or_bracket_openedStack;
- // A stack that stores expression(s) between braces in fstring
- this.braceExpressionStack;
- this.prevBraceExpression;
-
- // Instead of this._mode (_mode is not implemented in each ANTLR4 runtime)
- this.curLexerMode;
- // Instead of this._modeStack (_modeStack is not implemented in each ANTLR4 runtime)
- this.lexerModeStack;
-
- this.wasSpaceIndentation;
- this.wasTabIndentation;
- this.wasIndentationMixedWithSpacesAndTabs;
-
- this.curToken; // current (under processing) token
- this.ffgToken; // following (look ahead) token
-
- this.#init();
- }
-
- get #INVALID_LENGTH() { return -1; }
- get #ERR_TXT() { return " ERROR: "; }
-
- nextToken() { // reading the input stream until a return EOF
- this.#checkNextToken();
- return this.pendingTokens.shift() /* stack pollFirst() */; // add the queued token to the token stream
- }
-
- reset() {
- this.#init();
- super.reset();
- }
-
- #init() {
- this.indentLengthStack = [];
- this.pendingTokens = [];
- this.previousPendingTokenType = 0;
- this.lastPendingTokenTypeFromDefaultChannel = 0;
- this.opened = 0;
- this.paren_or_bracket_openedStack = [];
- this.braceExpressionStack = [];
- this.prevBraceExpression = "";
- this.curLexerMode = 0;
- this.lexerModeStack = [];
- this.wasSpaceIndentation = false;
- this.wasTabIndentation = false;
- this.wasIndentationMixedWithSpacesAndTabs = false;
- this.curToken = null;
- this.ffgToken = null;
- }
-
- #checkNextToken() {
- if (this.previousPendingTokenType === Token.EOF)
- return;
-
- if (this.indentLengthStack.length === 0) { // We're at the first token
- this.#insertENCODINGtoken();
- this.#setCurrentAndFollowingTokens();
- this.#handleStartOfInput();
- } else {
- this.#setCurrentAndFollowingTokens();
- }
-
- switch (this.curToken.type) {
- case PythonLexer.NEWLINE:
- this.#handleNEWLINEtoken();
- break;
- case PythonLexer.LPAR:
- case PythonLexer.LSQB:
- case PythonLexer.LBRACE:
- this.opened++;
- this.#addPendingToken(this.curToken);
- break;
- case PythonLexer.RPAR:
- case PythonLexer.RSQB:
- case PythonLexer.RBRACE:
- this.opened--;
- this.#addPendingToken(this.curToken);
- break;
- case PythonLexer.FSTRING_MIDDLE:
- this.#handleFSTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field
- this.#addPendingToken(this.curToken);
- break;
- case PythonLexer.COLONEQUAL:
- this.#handleCOLONEQUALtokenInFString();
- break;
- case PythonLexer.ERRORTOKEN:
- this.#reportLexerError(`token recognition error at: '${this.curToken.text}'`);
- this.#addPendingToken(this.curToken);
- break;
- case Token.EOF:
- this.#handleEOFtoken();
- break;
- default:
- this.#addPendingToken(this.curToken);
- }
- this.#handleFORMAT_SPECIFICATION_MODE();
- }
-
- #setCurrentAndFollowingTokens() {
- this.curToken = this.ffgToken == undefined ?
- super.nextToken() :
- this.ffgToken;
-
- this.#checkCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)!
-
- this.ffgToken = this.curToken.type === Token.EOF ?
- this.curToken :
- super.nextToken();
- }
-
- #insertENCODINGtoken() {
- let lineBuilder = [];
- let encodingName = "";
- let lineCount = 0;
- const ws_commentPattern = /^[ \t\f]*(#.*)?$/;
- const inputStream = this.inputStream;
- const size = inputStream.size;
-
- inputStream.seek(0);
- for (let i = 0; i < size; i++) {
- let c = String.fromCharCode(inputStream.LA(i + 1));
- lineBuilder.push(c);
-
- if (c == '\n' || i == size - 1) {
- let line = lineBuilder.join("").replace("\r", "").replace("\n", "");
- if (ws_commentPattern.test(line)) { // WS* + COMMENT? found
- encodingName = this.#getEncodingName(line);
- if (encodingName !== "") {
- break; // encoding found
- }
- } else {
- break; // statement or backslash found (line is not empty, not whitespace, not comment)
- }
-
- lineCount++;
- if (lineCount >= 2) {
- break; // check only the first two lines
- }
- lineBuilder = [];
- }
- }
-
- if (encodingName === "") {
- encodingName = "utf-8"; // default Python source code encoding
- }
-
- const encodingToken = new CommonToken([null, null], PythonLexer.ENCODING, Token.HIDDEN_CHANNEL, 0, 0);
- encodingToken.text = encodingName;
- encodingToken.line = 0;
- encodingToken.column = -1;
- this.#addPendingToken(encodingToken);
- }
-
- #getEncodingName(commentText) { // https://peps.python.org/pep-0263/#defining-the-encoding
- const encodingCommentPattern = /^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)/;
- const match = commentText.match(encodingCommentPattern);
- return match ? match[1] : "";
- }
-
- // initialize the _indentLengthStack
- // hide the leading NEWLINE token(s)
- // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
- // insert a leading INDENT token if necessary
- #handleStartOfInput() {
- // initialize the stack with a default 0 indentation length
- this.indentLengthStack.push(0); // this will never be popped off
- while (this.curToken.type !== Token.EOF) {
- if (this.curToken.channel === Token.DEFAULT_CHANNEL) {
- if (this.curToken.type === PythonLexer.NEWLINE) {
- // all the NEWLINE tokens must be ignored before the first statement
- this.#hideAndAddPendingToken(this.curToken);
- } else { // We're at the first statement
- this.#insertLeadingIndentToken();
- return; // continue the processing of the current token with #checkNextToken()
- }
- } else {
- this.#addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
- }
- this.#setCurrentAndFollowingTokens();
- } // continue the processing of the EOF token with #checkNextToken()
- }
-
- #insertLeadingIndentToken() {
- if (this.previousPendingTokenType === PythonLexer.WS) {
- const prevToken = this.pendingTokens.at(- 1); /* stack peekLast() */ // WS token
- if (this.#getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement
- const errMsg = "first statement indented";
- this.#reportLexerError(errMsg);
- // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
- this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.curToken);
- }
- }
- }
-
- #handleNEWLINEtoken() {
- if (this.lexerModeStack.length > 0) {
- this.#addPendingToken(this.curToken);
- } else if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
- this.#hideAndAddPendingToken(this.curToken);
- } else {
- const nlToken = this.curToken.clone(); // save the current NEWLINE token
- const isLookingAhead = this.ffgToken.type === PythonLexer.WS;
- if (isLookingAhead) {
- this.#setCurrentAndFollowingTokens(); // set the next two tokens
- }
-
- switch (this.ffgToken.type) {
- case PythonLexer.NEWLINE: // We're before a blank line
- case PythonLexer.COMMENT: // We're before a comment
- this.#hideAndAddPendingToken(nlToken);
- if (isLookingAhead) {
- this.#addPendingToken(this.curToken); // WS token
- }
- break;
- default:
- this.#addPendingToken(nlToken);
- if (isLookingAhead) { // We're on a whitespace(s) followed by a statement
- const indentationLength = this.ffgToken.type === Token.EOF ?
- 0 :
- this.#getIndentationLength(this.curToken.text);
-
- if (indentationLength !== this.#INVALID_LENGTH) {
- this.#addPendingToken(this.curToken); // WS token
- this.#insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
- } else {
- this.#reportError("inconsistent use of tabs and spaces in indentation");
- }
- } else { // We're at a newline followed by a statement (there is no whitespace before the statement)
- this.#insertIndentOrDedentToken(0); // may insert DEDENT token(s)
- }
- }
- }
- }
-
- #insertIndentOrDedentToken(curIndentLength) {
- let prevIndentLength = this.indentLengthStack.at(-1) /* peek() */;
- if (curIndentLength > prevIndentLength) {
- this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
- this.indentLengthStack.push(curIndentLength);
- } else {
- while (curIndentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
- this.indentLengthStack.pop();
- prevIndentLength = this.indentLengthStack.at(-1) /* peek() */;
- if (curIndentLength <= prevIndentLength) {
- this.#createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
- } else {
- this.#reportError("inconsistent dedent");
- }
- }
- }
- }
-
- #checkCurToken() {
- switch (this.curToken.type) {
- case PythonLexer.FSTRING_START:
- this.#setLexerModeByFSTRING_STARTtoken();
- return;
- case PythonLexer.FSTRING_MIDDLE:
- this.#handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field
- if (this.curToken.type === PythonLexer.FSTRING_MIDDLE) {
- return; // No curToken exchange happened
- }
- break;
- case PythonLexer.FSTRING_END:
- this.#popLexerMode();
- return;
- default:
- if (this.lexerModeStack.length === 0) {
- return; // Not in fstring mode
- }
- }
-
- switch (this.curToken.type) { // the following tokens can only come from default mode (after an LBRACE in fstring)
- case PythonLexer.NEWLINE:
- // append the current brace expression with the current newline
- this.#appendToBraceExpression(this.curToken.text)
- this.curToken.channel = Token.HIDDEN_CHANNEL;
- break;
- case PythonLexer.LBRACE:
- // the outermost brace expression cannot be a dictionary comprehension or a set comprehension
- this.braceExpressionStack.push("{");
- this.paren_or_bracket_openedStack.push(0);
- this.#pushLexerMode(Lexer.DEFAULT_MODE);
- break;
- case PythonLexer.LPAR:
- case PythonLexer.LSQB:
- // append the current brace expression with a "(" or a "["
- this.#appendToBraceExpression(this.curToken.text)
- // https://peps.python.org/pep-0498/#lambdas-inside-expressions
- this.#incrementBraceStack();
- break;
- case PythonLexer.RPAR:
- case PythonLexer.RSQB:
- // append the current brace expression with a ")" or a "]"
- this.#appendToBraceExpression(this.curToken.text)
- this.#decrementBraceStack();
- break;
- case PythonLexer.COLON:
- case PythonLexer.COLONEQUAL:
- // append the current brace expression with a ":" or a ":="
- this.#appendToBraceExpression(this.curToken.text)
- this.#setLexerModeByCOLONorCOLONEQUALtoken();
- break;
- case PythonLexer.RBRACE:
- this.#setLexerModeAfterRBRACEtoken();
- break;
- default:
- // append the current brace expression with the current token text
- this.#appendToBraceExpression(this.curToken.text)
- }
- }
-
- #appendToBraceExpression(text) {
- this.braceExpressionStack[this.braceExpressionStack.length - 1] += text;
- }
-
- #incrementBraceStack() { // increment the last element (peek() + 1)
- this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1]++;
- }
-
- #decrementBraceStack() { // decrement the last element (peek() - 1)
- this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1]--;
- }
-
- #setLexerModeAfterRBRACEtoken() {
- switch (this.curLexerMode) {
- case Lexer.DEFAULT_MODE:
- this.#popLexerMode();
- this.#popByBRACE();
- break;
- case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE:
- this.#popLexerMode();
- this.#popLexerMode();
- this.#popByBRACE();
- break;
- default:
- this.#reportLexerError("f-string: single '}' is not allowed");
- }
- }
-
- #setLexerModeByFSTRING_STARTtoken() {
- const text = this.curToken.text.toLowerCase();
- const modeMap = {
- "f'": PythonLexer.SQ1__FSTRING_MODE,
- "rf'": PythonLexer.SQ1R_FSTRING_MODE,
- "fr'": PythonLexer.SQ1R_FSTRING_MODE,
- 'f"': PythonLexer.DQ1__FSTRING_MODE,
- 'rf"': PythonLexer.DQ1R_FSTRING_MODE,
- 'fr"': PythonLexer.DQ1R_FSTRING_MODE,
- "f'''": PythonLexer.SQ3__FSTRING_MODE,
- "rf'''": PythonLexer.SQ3R_FSTRING_MODE,
- "fr'''": PythonLexer.SQ3R_FSTRING_MODE,
- 'f"""': PythonLexer.DQ3__FSTRING_MODE,
- 'rf"""': PythonLexer.DQ3R_FSTRING_MODE,
- 'fr"""': PythonLexer.DQ3R_FSTRING_MODE,
- };
- const mode = modeMap[text];
- if (mode !== undefined) {
- this.#pushLexerMode(mode);
- }
- }
-
- #setLexerModeByCOLONorCOLONEQUALtoken() {
- if (this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1] === 0) { // stack peek == 0
- const previousMode = this.lexerModeStack[this.lexerModeStack.length - 1]; // stack peek
- switch (previousMode) { // check the previous lexer mode (the current is DEFAULT_MODE)
- case PythonLexer.SQ1__FSTRING_MODE:
- case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE:
- this.#pushLexerMode(PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ1R_FSTRING_MODE:
- case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE:
- this.#pushLexerMode(PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ1__FSTRING_MODE:
- case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE:
- this.#pushLexerMode(PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ1R_FSTRING_MODE:
- case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE:
- this.#pushLexerMode(PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ3__FSTRING_MODE:
- case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE:
- this.#pushLexerMode(PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ3R_FSTRING_MODE:
- case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE:
- this.#pushLexerMode(PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ3__FSTRING_MODE:
- case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE:
- this.#pushLexerMode(PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ3R_FSTRING_MODE:
- case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE:
- this.#pushLexerMode(PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- }
- }
- }
-
- #popByBRACE() {
- this.paren_or_bracket_openedStack.pop();
- this.prevBraceExpression = this.braceExpressionStack.pop() + "}";
- if (this.braceExpressionStack.length > 0) {
- // append the current brace expression with the previous brace expression
- this.braceExpressionStack[this.braceExpressionStack.length - 1] += this.prevBraceExpression;
- }
- }
-
- #handleFSTRING_MIDDLEtokenWithDoubleBrace() {
- // replace the trailing double brace with a single brace and insert a hidden brace token
- switch (this.#getLastTwoCharsOfTheCurTokenText()) {
- case "{{":
- this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL);
- break;
- case "}}":
- this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL);
- break;
- }
- }
-
- #handleFSTRING_MIDDLEtokenWithQuoteAndLBrace() {
- // replace the trailing quote + left_brace with a quote and insert an LBRACE token
- // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token
- switch (this.#getLastTwoCharsOfTheCurTokenText()) {
- case "\"{":
- case "'{":
- case "\\{":
- this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL);
- break;
- }
- }
-
- #getLastTwoCharsOfTheCurTokenText() {
- return this.curToken.text.slice(-2);
- }
-
- #trimLastCharAddPendingTokenSetCurToken(type, text, channel) {
- // trim the last char and add the modified curToken to the pendingTokens stack
- const tokenTextWithoutLastChar = this.curToken.text.slice(0, -1);
- this.curToken.text = tokenTextWithoutLastChar;
- this.curToken.stop -= 1;
- this.#addPendingToken(this.curToken);
-
- this.#createNewCurToken(type, text, channel); // set curToken
- }
-
- #handleCOLONEQUALtokenInFString() {
- if (this.lexerModeStack.length > 0 &&
- this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1] === 0) { // stack peek == 0
-
- // In fstring a colonequal (walrus operator) can only be used in parentheses
- // Not in parentheses, replace COLONEQUAL token with COLON as format specifier
- // and insert the equal symbol to the following FSTRING_MIDDLE token
- this.curToken.type = PythonLexer.COLON;
- this.curToken.text = ":";
- this.curToken.stop = this.curToken.start;
-
- if (this.ffgToken.type === PythonLexer.FSTRING_MIDDLE) {
- this.ffgToken.text = "=" + this.ffgToken.text;
- this.ffgToken.start -= 1;
- this.ffgToken.column -= 1;
- } else {
- this.#addPendingToken(this.curToken);
- this.#createNewCurToken(PythonLexer.FSTRING_MIDDLE, "=", Token.DEFAULT_CHANNEL);
- }
- }
- this.#addPendingToken(this.curToken);
- }
-
- #createNewCurToken(type, text, channel) {
- const ctkn = this.curToken.clone();
- ctkn.type = type;
- ctkn.text = text;
- ctkn.channel = channel;
- ctkn.column += 1;
- ctkn.start += 1;
- ctkn.stop = ctkn.start;
- this.curToken = ctkn;
- }
-
- #pushLexerMode(mode) {
- this.pushMode(mode);
- this.lexerModeStack.push(this.curLexerMode);
- this.curLexerMode = mode;
- }
-
- #popLexerMode() {
- this.popMode();
- this.curLexerMode = this.lexerModeStack.pop();
- }
-
- #handleFORMAT_SPECIFICATION_MODE() {
- if (this.lexerModeStack.length > 0 &&
- this.ffgToken.type === PythonLexer.RBRACE) {
-
- // insert an empty FSTRING_MIDDLE token instead of the missing format specification
- switch (this.curToken.type) {
- case PythonLexer.COLON:
- this.#createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken);
- break;
- case PythonLexer.RBRACE:
- // only if the previous brace expression is not a dictionary comprehension or set comprehension
- if (!this.#isDictionaryComprehensionOrSetComprehension(this.prevBraceExpression)) {
- this.#createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken);
- }
- break;
- }
- }
- }
-
- #isDictionaryComprehensionOrSetComprehension(code) {
- const inputStream = CharStreams.fromString(code);
- const lexer = new PythonLexer(inputStream);
- const tokenStream = new CommonTokenStream(lexer);
- let parser = new PythonParser(tokenStream);
-
- // Disable error listeners to suppress console output
- lexer.removeErrorListeners();
- parser.removeErrorListeners();
-
- parser.dictcomp(); // Try parsing as dictionary comprehension
- if (parser.syntaxErrorsCount === 0)
- return true;
-
- parser = new PythonParser(tokenStream);
- tokenStream.seek(0);
- parser.removeErrorListeners();
- parser.setcomp(); // Try parsing as set comprehension
- return parser.syntaxErrorsCount === 0;
- }
-
- #insertTrailingTokens() {
- switch (this.lastPendingTokenTypeFromDefaultChannel) {
- case PythonLexer.NEWLINE:
- case PythonLexer.DEDENT:
- break; // no trailing NEWLINE token is needed
- default:
- // insert an extra trailing NEWLINE token that serves as the end of the last statement
- this.#createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // _ffgToken is EOF
- }
- this.#insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
- }
-
- #handleEOFtoken() {
- if (this.lastPendingTokenTypeFromDefaultChannel > 0) {
- // there was a statement in the input (leading NEWLINE tokens are hidden)
- this.#insertTrailingTokens();
- }
- this.#addPendingToken(this.curToken);
- }
-
- #hideAndAddPendingToken(ctkn) {
- ctkn.channel = Token.HIDDEN_CHANNEL;
- this.#addPendingToken(ctkn);
- }
-
- #createAndAddPendingToken(type, channel, text, sampleToken) {
- const ctkn = sampleToken.clone();
- ctkn.type = type;
- ctkn.channel = channel;
- ctkn.stop = sampleToken.start - 1;
- ctkn.text = text == null ?
- `<${this.getSymbolicNames()[type]}>` :
- text;
-
- this.#addPendingToken(ctkn);
- }
-
- #addPendingToken(tkn) {
- // save the last pending token type because the _pendingTokens linked list can be empty by the nextToken()
- this.previousPendingTokenType = tkn.type;
- if (tkn.channel === Token.DEFAULT_CHANNEL) {
- this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
- }
- this.pendingTokens.push(tkn) /* .addLast(token) */;
- }
-
- #getIndentationLength(indentText) { // the indentText may contain spaces, tabs or form feeds
- const TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces
- let length = 0;
- for (let ch of indentText) {
- switch (ch) {
- case " ":
- this.wasSpaceIndentation = true;
- length += 1;
- break;
- case "\t":
- this.wasTabIndentation = true;
- length += TAB_LENGTH - (length % TAB_LENGTH);
- break;
- case "\f": // form feed
- length = 0;
- break;
- }
- }
-
- if (this.wasTabIndentation && this.wasSpaceIndentation) {
- if (!this.wasIndentationMixedWithSpacesAndTabs) {
- this.wasIndentationMixedWithSpacesAndTabs = true;
- length = this.#INVALID_LENGTH; // only for the first inconsistent indent
- }
- }
- return length;
- }
-
- #reportLexerError(errMsg) {
- this.getErrorListener().syntaxError(this, this.curToken, this.curToken.line, this.curToken.column, " LEXER" + this.#ERR_TXT + errMsg, null);
- }
-
- #reportError(errMsg) {
- this.#reportLexerError(errMsg);
-
- // the ERRORTOKEN will raise an error in the parser
- this.#createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.ffgToken);
- }
-}
diff --git a/python/python3_13/Python3/PythonLexerBase.py b/python/python3_13/Python3/PythonLexerBase.py
deleted file mode 100644
index d3272163a9..0000000000
--- a/python/python3_13/Python3/PythonLexerBase.py
+++ /dev/null
@@ -1,557 +0,0 @@
-# The MIT License (MIT)
-# Copyright (c) 2021 Robert Einhorn
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-# Project : Python Indent/Dedent handler for ANTLR4 grammars
-#
-# Developed by : Robert Einhorn
-
-from typing import TextIO, Optional, List, Deque
-from antlr4 import InputStream, Lexer, Token
-from antlr4.Token import CommonToken
-import sys
-import re
-
-class PythonLexerBase(Lexer):
- def __init__(self, input: InputStream, output: TextIO = sys.stdout):
- super().__init__(input, output)
-
- # A stack that keeps track of the indentation lengths
- self.__indent_length_stack: List[int]
-
- # A list where tokens are waiting to be loaded into the token stream
- self.__pending_tokens: Deque[CommonToken]
-
- # last pending token type
- self.__previous_pending_token_type: int
- self.__last_pending_token_type_from_default_channel: int
-
- # The amount of opened parentheses, square brackets or curly braces
- self.__opened: int
- # The amount of opened parentheses and square brackets in the current lexer mode
- self.__paren_or_bracket_opened_stack: List[int]
- # A stack that stores expression(s) between braces in fstring
- self.__brace_expression_stack: List[str]
- self.__prev_brace_expression: str
-
- # Instead of self._mode (self._mode is not implemented in each ANTLR4 runtime)
- self.__cur_lexer_mode: int
- # Instead of self._modeStack (self._modeStack is not implemented in each ANTLR4 runtime)
- self.__lexer_mode_stack: List[int]
-
- self.__was_space_indentation: bool
- self.__was_tab_indentation: bool
- self.__was_indentation_mixed_with_spaces_and_tabs: bool
-
- self.__cur_token: CommonToken # current (under processing) token
- self.__ffg_token: CommonToken # following (look ahead) token
-
- self.__INVALID_LENGTH: int = -1
- self.__ERR_TXT: str = " ERROR: "
-
- self.__init()
-
- def nextToken(self) -> CommonToken: # reading the input stream until a return EOF
- self.__check_next_token()
- return self.__pending_tokens.popleft() # add the queued token to the token stream
-
- def reset(self) -> None:
- self.__init()
- super().reset()
-
- def __init(self) -> None:
- self.__indent_length_stack = []
- self.__pending_tokens = Deque()
- self.__previous_pending_token_type = 0
- self.__last_pending_token_type_from_default_channel = 0
- self.__opened = 0
- self.__paren_or_bracket_opened_stack = []
- self.__brace_expression_stack = []
- self.__prev_brace_expression = ""
- self.__cur_lexer_mode = 0
- self.__lexer_mode_stack = []
- self.__was_space_indentation = False
- self.__was_tab_indentation = False
- self.__was_indentation_mixed_with_spaces_and_tabs = False
- self.__cur_token = None
- self.__ffg_token = None
-
- def __check_next_token(self) -> None:
- if self.__previous_pending_token_type == Token.EOF:
- return
-
- if not self.__indent_length_stack: # We're at the first token
- self.__insert_ENCODING_token()
- self.__set_current_and_following_tokens()
- self.__handle_start_of_input()
- else:
- self.__set_current_and_following_tokens()
-
- match self.__cur_token.type:
- case self.NEWLINE:
- self.__handle_NEWLINE_token()
- case self.LPAR | self.LSQB | self.LBRACE:
- self.__opened += 1
- self.__add_pending_token(self.__cur_token)
- case self.RPAR | self.RSQB | self.RBRACE:
- self.__opened -= 1
- self.__add_pending_token(self.__cur_token)
- case self.FSTRING_MIDDLE:
- self.__handle_FSTRING_MIDDLE_token_with_double_brace() # does not affect the opened field
- self.__add_pending_token(self.__cur_token)
- case self.COLONEQUAL:
- self.__handle_COLONEQUAL_token_in_fstring()
- case self.ERRORTOKEN:
- self.__report_lexer_error("token recognition error at: '" + self.__cur_token.text + "'")
- self.__add_pending_token(self.__cur_token)
- case Token.EOF:
- self.__handle_EOF_token()
- case _:
- self.__add_pending_token(self.__cur_token)
- self.__handle_FORMAT_SPECIFICATION_MODE()
-
- def __set_current_and_following_tokens(self) -> None:
- self.__cur_token = super().nextToken() if self.__ffg_token is None else \
- self.__ffg_token
-
- self.__check_cur_token() # ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)!
-
- self.__ffg_token = self.__cur_token if self.__cur_token.type == Token.EOF else \
- super().nextToken()
-
- def __insert_ENCODING_token(self) -> None: # https://peps.python.org/pep-0263/
- line_builder: list[str] = []
- encoding_name: str = ""
- line_count: int = 0
- ws_comment_pattern: re.Pattern = re.compile(r"^[ \t\f]*(#.*)?$")
- input_stream: InputStream = self.inputStream
- size: int = input_stream.size
-
- input_stream.seek(0)
- for i in range(size):
- c: str = chr(input_stream.LA(i + 1))
- line_builder.append(c)
-
- if c == '\n' or i == size - 1:
- line: str = ''.join(line_builder).replace("\r", "").replace("\n", "")
- if ws_comment_pattern.match(line): # WS* + COMMENT? found
- encoding_name = self.__get_encoding_name(line)
- if encoding_name:
- break # encoding found
- else:
- break # statement or backslash found (first line is not empty, not whitespace(s), not comment)
-
- line_count += 1
- if line_count >= 2:
- break # check only the first two lines
- line_builder = []
-
- if not encoding_name:
- encoding_name = "utf-8" # default Python source code encoding
-
- encoding_token: CommonToken = CommonToken((None, None), self.ENCODING, CommonToken.HIDDEN_CHANNEL, 0, 0)
- encoding_token.text = encoding_name
- encoding_token.line = 0
- encoding_token.column = -1
- self.__add_pending_token(encoding_token)
-
- def __get_encoding_name(self, comment_text: str) -> str: # https://peps.python.org/pep-0263/#defining-the-encoding
- encoding_comment_pattern: str = r"^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)"
- match: Optional[re.Match] = re.search(encoding_comment_pattern, comment_text)
- return match.group(1) if match else ""
-
- # initialize the _indent_length_stack
- # hide the leading NEWLINE token(s)
- # if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
- # insert a leading INDENT token if necessary
- def __handle_start_of_input(self) -> None:
- # initialize the stack with a default 0 indentation length
- self.__indent_length_stack.append(0) # this will never be popped off
- while self.__cur_token.type != Token.EOF:
- if self.__cur_token.channel == Token.DEFAULT_CHANNEL:
- if self.__cur_token.type == self.NEWLINE:
- # all the NEWLINE tokens must be ignored before the first statement
- self.__hide_and_add_pending_token(self.__cur_token)
- else: # We're at the first statement
- self.__insert_leading_indent_token()
- return # continue the processing of the current token with __check_next_token()
- else:
- self.__add_pending_token(self.__cur_token) # it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
- self.__set_current_and_following_tokens()
- # continue the processing of the EOF token with __check_next_token()
-
- def __insert_leading_indent_token(self) -> None:
- if self.__previous_pending_token_type == self.WS:
- prev_token: CommonToken = self.__pending_tokens[-1] # WS token
- if self.__get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement
- err_msg: str = "first statement indented"
- self.__report_lexer_error(err_msg)
- # insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
- self.__create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__cur_token)
-
- def __handle_NEWLINE_token(self) -> None:
- if self.__lexer_mode_stack: # not is_empty
- self.__add_pending_token(self.__cur_token)
- elif self.__opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token
- self.__hide_and_add_pending_token(self.__cur_token)
- else:
- nl_token: CommonToken = self.__cur_token.clone() # save the current NEWLINE token
- is_looking_ahead: bool = self.__ffg_token.type == self.WS
- if is_looking_ahead:
- self.__set_current_and_following_tokens() # set the next two tokens
-
- match self.__ffg_token.type:
- case self.NEWLINE | self.COMMENT:
- # We're before a blank line or a comment or type comment or a type ignore comment
- self.__hide_and_add_pending_token(nl_token) # ignore the NEWLINE token
- if is_looking_ahead:
- self.__add_pending_token(self.__cur_token) # WS token
- case _:
- self.__add_pending_token(nl_token)
- if is_looking_ahead: # We're on a whitespace(s) followed by a statement
- indentation_length: int = 0 if self.__ffg_token.type == Token.EOF else \
- self.__get_indentation_length(self.__cur_token.text)
-
- if indentation_length != self.__INVALID_LENGTH:
- self.__add_pending_token(self.__cur_token) # WS token
- self.__insert_INDENT_or_DEDENT_token(indentation_length) # may insert INDENT token or DEDENT token(s)
- else:
- self.__report_error("inconsistent use of tabs and spaces in indentation")
- else: # We're at a newline followed by a statement (there is no whitespace before the statement)
- self.__insert_INDENT_or_DEDENT_token(0) # may insert DEDENT token(s)
-
- def __insert_INDENT_or_DEDENT_token(self, indent_length: int) -> None:
- prev_indent_length: int = self.__indent_length_stack[-1] # stack peek
- if indent_length > prev_indent_length:
- self.__create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, None, self.__ffg_token)
- self.__indent_length_stack.append(indent_length) # stack push
- else:
- while indent_length < prev_indent_length: # more than 1 DEDENT token may be inserted to the token stream
- self.__indent_length_stack.pop()
- prev_indent_length = self.__indent_length_stack[-1] # stack peek
- if indent_length <= prev_indent_length:
- self.__create_and_add_pending_token(self.DEDENT, Token.DEFAULT_CHANNEL, None, self.__ffg_token)
- else:
- self.__report_error("inconsistent dedent")
-
- def __check_cur_token(self) -> None:
- match self.__cur_token.type:
- case self.FSTRING_START:
- self.__set_lexer_mode_by_FSTRING_START_token()
- return
- case self.FSTRING_MIDDLE:
- self.__handle_FSTRING_MIDDLE_token_with_quote_and_lbrace() # affect the opened field
- if self.__cur_token.type == self.FSTRING_MIDDLE:
- return # No __cur_token exchange happened
- case self.FSTRING_END:
- self.__pop_lexer_mode()
- return
- case _:
- if not self.__lexer_mode_stack:
- return # Not in fstring mode
-
- match self.__cur_token.type: # the following tokens can only come from default mode (after an LBRACE in fstring)
- case self.NEWLINE:
- # append the current brace expression with the current newline
- self.__append_to_brace_expression(self.__cur_token.text)
- self.__cur_token.channel = Token.HIDDEN_CHANNEL
- case self.LBRACE:
- # the outermost brace expression cannot be a dictionary comprehension or a set comprehension
- self.__brace_expression_stack.append("{")
- self.__paren_or_bracket_opened_stack.append(0) # stack push
- self.__push_lexer_mode(Lexer.DEFAULT_MODE)
- case self.LPAR | self.LSQB:
- # append the current brace expression with a "(" or a "["
- self.__append_to_brace_expression(self.__cur_token.text)
- # https://peps.python.org/pep-0498/#lambdas-inside-expressions
- self.__increment_brace_stack()
- case self.RPAR | self.RSQB:
- # append the current brace expression with a ")" or a "]"
- self.__append_to_brace_expression(self.__cur_token.text)
- self.__decrement_brace_stack()
- case self.COLON | self.COLONEQUAL:
- # append the current brace expression with a ":" or a ":="
- self.__append_to_brace_expression(self.__cur_token.text)
- self.__set_lexer_mode_by_COLON_or_COLONEQUAL_token()
- case self.RBRACE:
- self.__set_lexer_mode_after_RBRACE_token()
- case _:
- # append the current brace expression with the current token text
- self.__append_to_brace_expression(self.__cur_token.text)
-
- def __append_to_brace_expression(self, text: str) -> None:
- self.__brace_expression_stack[-1] += text
-
- def __increment_brace_stack(self) -> None: # increment the last element (peek() + 1)
- self.__paren_or_bracket_opened_stack[-1] += 1
-
- def __decrement_brace_stack(self) -> None: # decrement the last element (peek() - 1)
- self.__paren_or_bracket_opened_stack[-1] -= 1
-
- def __set_lexer_mode_after_RBRACE_token(self) -> None:
- match self.__cur_lexer_mode:
- case Lexer.DEFAULT_MODE:
- self.__pop_lexer_mode() # only once
- self.__pop_by_RBRACE()
-
- case self.SQ1__FORMAT_SPECIFICATION_MODE \
- | self.SQ1R_FORMAT_SPECIFICATION_MODE \
- | self.DQ1__FORMAT_SPECIFICATION_MODE \
- | self.DQ1R_FORMAT_SPECIFICATION_MODE \
- | self.SQ3__FORMAT_SPECIFICATION_MODE \
- | self.SQ3R_FORMAT_SPECIFICATION_MODE \
- | self.DQ3__FORMAT_SPECIFICATION_MODE \
- | self.DQ3R_FORMAT_SPECIFICATION_MODE:
-
- self.__pop_lexer_mode()
- self.__pop_lexer_mode()
- self.__pop_by_RBRACE()
- case _:
- self.__report_lexer_error("f-string: single '}' is not allowed")
-
- def __set_lexer_mode_by_FSTRING_START_token(self) -> None:
- text = self.__cur_token.text.lower()
- mode_map = {
- "f'": self.SQ1__FSTRING_MODE,
- "rf'": self.SQ1R_FSTRING_MODE,
- "fr'": self.SQ1R_FSTRING_MODE,
- 'f"': self.DQ1__FSTRING_MODE,
- 'rf"': self.DQ1R_FSTRING_MODE,
- 'fr"': self.DQ1R_FSTRING_MODE,
- "f'''": self.SQ3__FSTRING_MODE,
- "rf'''": self.SQ3R_FSTRING_MODE,
- "fr'''": self.SQ3R_FSTRING_MODE,
- 'f"""': self.DQ3__FSTRING_MODE,
- 'rf"""': self.DQ3R_FSTRING_MODE,
- 'fr"""': self.DQ3R_FSTRING_MODE,
- }
- mode = mode_map.get(text)
- if mode is not None:
- self.__push_lexer_mode(mode)
-
- def __set_lexer_mode_by_COLON_or_COLONEQUAL_token(self) -> None:
- if self.__paren_or_bracket_opened_stack[-1] == 0: # stack peek == 0
- # COLONEQUAL token will be replaced with a COLON token in checkNextToken()
- match self.__lexer_mode_stack[-1]: # check the previous lexer mode (the current is DEFAULT_MODE)
- case self.SQ1__FSTRING_MODE \
- | self.SQ1__FORMAT_SPECIFICATION_MODE:
-
- self.__push_lexer_mode(self.SQ1__FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
- case self.SQ1R_FSTRING_MODE \
- | self.SQ1R_FORMAT_SPECIFICATION_MODE:
-
- self.__push_lexer_mode(self.SQ1R_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
- case self.DQ1__FSTRING_MODE \
- | self.DQ1__FORMAT_SPECIFICATION_MODE:
-
- self.__push_lexer_mode(self.DQ1__FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
- case self.DQ1R_FSTRING_MODE \
- | self.DQ1R_FORMAT_SPECIFICATION_MODE:
-
- self.__push_lexer_mode(self.DQ1R_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
- case self.SQ3__FSTRING_MODE \
- | self.SQ3__FORMAT_SPECIFICATION_MODE:
-
- self.__push_lexer_mode(self.SQ3__FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
- case self.SQ3R_FSTRING_MODE \
- | self.SQ3R_FORMAT_SPECIFICATION_MODE:
-
- self.__push_lexer_mode(self.SQ3R_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
- case self.DQ3__FSTRING_MODE \
- | self.DQ3__FORMAT_SPECIFICATION_MODE:
-
- self.__push_lexer_mode(self.DQ3__FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
- case self.DQ3R_FSTRING_MODE \
- | self.DQ3R_FORMAT_SPECIFICATION_MODE:
-
- self.__push_lexer_mode(self.DQ3R_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
-
- def __pop_by_RBRACE(self) -> None:
- self.__paren_or_bracket_opened_stack.pop()
- self.__prev_brace_expression = self.__brace_expression_stack.pop() + "}"
- if self.__brace_expression_stack:
- # append the current brace expression with the previous brace expression
- self.__brace_expression_stack[-1] += self.__prev_brace_expression
-
- def __handle_FSTRING_MIDDLE_token_with_double_brace(self) -> None:
- # replace the trailing double brace with a single brace and insert a hidden brace token
- match self.__get_last_two_chars_of_the_cur_token_text():
- case "{{":
- self.__trim_last_char_add_pending_token_set_cur_token(self.LBRACE, "{", Token.HIDDEN_CHANNEL)
- case "}}":
- self.__trim_last_char_add_pending_token_set_cur_token(self.RBRACE, "}", Token.HIDDEN_CHANNEL)
-
- def __handle_FSTRING_MIDDLE_token_with_quote_and_lbrace(self) -> None:
- # replace the trailing quote + left_brace with a quote and insert an LBRACE token
- # replace the trailing backslash + left_brace with a backslash and insert an LBRACE token
- match self.__get_last_two_chars_of_the_cur_token_text():
- case "\"{" | "'{" | "\\{":
- self.__trim_last_char_add_pending_token_set_cur_token(self.LBRACE, "{", Token.DEFAULT_CHANNEL)
-
- def __get_last_two_chars_of_the_cur_token_text(self) -> str:
- cur_token_text: str = self.__cur_token.text
- return cur_token_text[-2:] if len(cur_token_text) >= 2 else cur_token_text
-
- def __trim_last_char_add_pending_token_set_cur_token(self, type: int, text: str, channel: int) -> None:
- # trim the last char and add the modified curToken to the __pending_tokens stack
- token_text_without_lbrace: str = self.__cur_token.text[:-1]
- self.__cur_token.text = token_text_without_lbrace
- self.__cur_token.stop -= 1
- self.__add_pending_token(self.__cur_token)
-
- self.__create_new_cur_token(type, text, channel) # set __cur_token
-
- def __handle_COLONEQUAL_token_in_fstring(self) -> None:
- if self.__lexer_mode_stack \
- and self.__paren_or_bracket_opened_stack[-1] == 0: # stack peek == 0
-
- # In fstring a colonequal (walrus operator) can only be used in parentheses
- # Not in parentheses, replace COLONEQUAL token with COLON as format specifier
- # and insert the equal symbol to the following FSTRING_MIDDLE token
- self.__cur_token.type = self.COLON
- self.__cur_token.text = ":"
- self.__cur_token.stop = self.__cur_token.start
- if self.__ffg_token.type == self.FSTRING_MIDDLE:
- self.__ffg_token.text = "=" + self.__ffg_token.text
- self.__ffg_token.start -= 1
- self.__ffg_token.column -= 1
- else:
- self.__add_pending_token(self.__cur_token)
- self.__create_new_current_token(self.FSTRING_MIDDLE, "=", Token.DEFAULT_CHANNEL)
- self.__add_pending_token(self.__cur_token)
-
- def __create_new_cur_token(self, type: int, text: str, channel: int) -> None:
- ctkn: CommonToken = self.__cur_token.clone()
- ctkn.type = type
- ctkn.text = text
- ctkn.channel = channel
- ctkn.column += 1
- ctkn.start += 1
- ctkn.stop = ctkn.start
- self.__cur_token = ctkn
-
- def __push_lexer_mode(self, mode: int) -> None:
- self.pushMode(mode)
- self.__lexer_mode_stack.append(self.__cur_lexer_mode) # stack push
- self.__cur_lexer_mode = mode
-
- def __pop_lexer_mode(self) -> None:
- self.popMode()
- self.__cur_lexer_mode = self.__lexer_mode_stack.pop()
-
- def __handle_FORMAT_SPECIFICATION_MODE(self) -> None:
- if self.__lexer_mode_stack \
- and self.__ffg_token.type == self.RBRACE:
-
- # insert an empty FSTRING_MIDDLE token instead of the missing format specification
- match self.__cur_token.type:
- case self.COLON:
- self.__create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", self.__ffg_token)
- case self.RBRACE:
- # only if the previous brace expression is not a dictionary comprehension or set comprehension
- if not self.__is_dictionary_comprehension_or_set_comprehension(self.__prev_brace_expression):
- self.__create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", self.__ffg_token)
-
- def __is_dictionary_comprehension_or_set_comprehension(self, code: str) -> bool:
- from antlr4 import InputStream, CommonTokenStream
- from PythonLexer import PythonLexer
- from PythonParser import PythonParser
-
- input_stream: InputStream = InputStream(code)
- lexer: PythonLexer = PythonLexer(input_stream)
- token_stream: CommonTokenStream = CommonTokenStream(lexer)
- parser: PythonParser = PythonParser(token_stream)
-
- # Disable error listeners to suppress console output
- lexer.removeErrorListeners()
- parser.removeErrorListeners()
-
- parser.dictcomp() # Try parsing as dictionary comprehension
- if parser.getNumberOfSyntaxErrors() == 0:
- return True
-
- parser = PythonParser(token_stream)
- token_stream.seek(0)
- parser.removeErrorListeners()
- parser.setcomp() # Try parsing as set comprehension
- return parser.getNumberOfSyntaxErrors() == 0
-
- def __insert_trailing_tokens(self) -> None:
- match self.__last_pending_token_type_from_default_channel:
- case self.NEWLINE | self.DEDENT:
- pass # no trailing NEWLINE token is needed
- case _: # insert an extra trailing NEWLINE token that serves as the end of the last statement
- self.__create_and_add_pending_token(self.NEWLINE, Token.DEFAULT_CHANNEL, None, self.__ffg_token) # _ffg_token is EOF
- self.__insert_INDENT_or_DEDENT_token(0) # Now insert as much trailing DEDENT tokens as needed
-
- def __handle_EOF_token(self) -> None:
- if self.__last_pending_token_type_from_default_channel > 0:
- # there was statement in the input (leading NEWLINE tokens are hidden)
- self.__insert_trailing_tokens()
- self.__add_pending_token(self.__cur_token)
-
- def __hide_and_add_pending_token(self, ctkn: CommonToken) -> None:
- ctkn.channel = Token.HIDDEN_CHANNEL
- self.__add_pending_token(ctkn)
-
- def __create_and_add_pending_token(self, ttype: int, channel: int, text: Optional[str], sample_token: CommonToken) -> None:
- ctkn: CommonToken = sample_token.clone()
- ctkn.type = ttype
- ctkn.channel = channel
- ctkn.stop = sample_token.start - 1
- ctkn.text = "<" + self.symbolicNames[ttype] + ">" if text is None else \
- text
-
- self.__add_pending_token(ctkn)
-
- def __add_pending_token(self, ctkn: CommonToken) -> None:
- # save the last pending token type because the _pending_tokens list can be empty by the nextToken()
- self.__previous_pending_token_type = ctkn.type
- if ctkn.channel == Token.DEFAULT_CHANNEL:
- self.__last_pending_token_type_from_default_channel = self.__previous_pending_token_type
- self.__pending_tokens.append(ctkn)
-
- def __get_indentation_length(self, indentText: str) -> int: # the indentText may contain spaces, tabs or form feeds
- TAB_LENGTH: int = 8 # the standard number of spaces to replace a tab with spaces
- length: int = 0
- ch: str
- for ch in indentText:
- match ch:
- case ' ':
- self.__was_space_indentation = True
- length += 1
- case '\t':
- self.__was_tab_indentation = True
- length += TAB_LENGTH - (length % TAB_LENGTH)
- case '\f': # form feed
- length = 0
-
- if self.__was_tab_indentation and self.__was_space_indentation:
- if not self.__was_indentation_mixed_with_spaces_and_tabs:
- self.__was_indentation_mixed_with_spaces_and_tabs = True
- length = self.__INVALID_LENGTH # only for the first inconsistent indent
- return length
-
- def __report_lexer_error(self, err_msg: str) -> None:
- self.getErrorListenerDispatch().syntaxError(self, self.__cur_token, self.__cur_token.line, self.__cur_token.column, " LEXER" + self.__ERR_TXT + err_msg, None)
-
- def __report_error(self, err_msg: str) -> None:
- self.__report_lexer_error(err_msg)
-
- # the ERRORTOKEN will raise an error in the parser
- self.__create_and_add_pending_token(self.ERRORTOKEN, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__ffg_token)
diff --git a/python/python3_13/README.md b/python/python3_13/README.md
deleted file mode 100644
index 3f02d91e4f..0000000000
--- a/python/python3_13/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# Python 3.13.2 parser
-
-### About files:
-- PythonParser.g4 is the ANTLR4 parser grammar that based on the official [Python PEG grammar](https://docs.python.org/3.13/reference/grammar.html)
-
-- PythonLexerBase class
- - handles the Python indentations
- - creates encoding token
- - tokenizes fstring literals
- - and manage many other things
-
-- Example files from: [Python 3.13 Standard Lib](https://github.com/python/cpython/tree/3.13/Lib)
-
-### Recent changes:
-- parser grammar update for Python 3.13.2
-- added ENCODING token
-- complete rewrite of fstring tokenizer in lexer grammar and PythonLexerBase class
- - now correctly tokenizes the followings in fstring:
- - escape sequences
- - walrus operator
- - dictionary comprehension
- - set comprehension
-- soft keywords changes:
- - no embedded code (semantic predicates) in parser grammar for soft keywords
- - no need for PythonParserBase class
- - no need for transformGrammar.py
- - **BREAKING CHANGES**:
- - dedicated tokens for soft keywords instead of NAME token:
- - NAME_OR_TYPE
- - NAME_OR_MATCH
- - NAME_OR_CASE
- - NAME_OR_WILDCARD
-
-#### [Previous changes](https://github.com/antlr/grammars-v4/tree/master/python/python3_13)
-
-### Related link:
-[ANTLR4-parser-for-Python-3.13](https://github.com/RobEin/ANTLR4-parser-for-Python-3.13)
\ No newline at end of file
diff --git a/python/python3_13/TypeScript/PythonLexerBase.ts b/python/python3_13/TypeScript/PythonLexerBase.ts
deleted file mode 100644
index 5ba9b2062e..0000000000
--- a/python/python3_13/TypeScript/PythonLexerBase.ts
+++ /dev/null
@@ -1,677 +0,0 @@
-/*
-The MIT License (MIT)
-Copyright (c) 2021 Robert Einhorn
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
- */
-
-/*
- *
- * Project : Python Indent/Dedent handler for ANTLR4 grammars
- *
- * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com
- *
- */
-
-import { CharStream, CharStreams, CommonTokenStream, Token, CommonToken, Lexer, TokenStream } from "antlr4";
-import PythonLexer from "./PythonLexer";
-import PythonParser from "./PythonParser";
-import * as Collections from "typescript-collections";
-
-export default abstract class PythonLexerBase extends Lexer {
- // A stack that keeps track of the indentation lengths
- private indentLengthStack!: Collections.Stack;
- // A list where tokens are waiting to be loaded into the token stream
- private pendingTokens!: Array;
-
- // last pending token types
- private previousPendingTokenType!: number;
- private lastPendingTokenTypeFromDefaultChannel!: number;
-
- // The amount of opened parentheses, square brackets or curly braces
- private opened!: number;
- // The amount of opened parentheses and square brackets in the current lexer mode
- private paren_or_bracket_openedStack!: Array;
- // A stack that stores expression(s) between braces in fstring
- private braceExpressionStack!: Array;
- private prevBraceExpression!: string;
-
- // Instead of this._mode (_mode is not implemented in each ANTLR4 runtime)
- private curLexerMode!: number;
- // Instead of this._modeStack (_modeStack is not implemented in each ANTLR4 runtime)
- private lexerModeStack!: Array;
-
- private wasSpaceIndentation!: boolean;
- private wasTabIndentation!: boolean;
- private wasIndentationMixedWithSpacesAndTabs!: boolean;
-
- private curToken: Token | undefined; // current (under processing) token
- private ffgToken: Token | undefined; // following (look ahead) token
-
- private readonly INVALID_LENGTH: number = -1;
- private readonly ERR_TXT: string = " ERROR: ";
-
- protected constructor(input: CharStream) {
- super(input);
- this.init();
- }
-
- public nextToken(): Token { // reading the input stream until a return EOF
- this.checkNextToken();
- return this.pendingTokens.shift()! /* .pollFirst() */; // add the queued token to the token stream
- }
-
- public reset(): void {
- this.init();
- super.reset();
- }
-
- private init(): void {
- this.indentLengthStack = new Collections.Stack();
- this.pendingTokens = [];
- this.previousPendingTokenType = 0;
- this.lastPendingTokenTypeFromDefaultChannel = 0;
- this.opened = 0;
- this.paren_or_bracket_openedStack = [];
- this.braceExpressionStack = [];
- this.prevBraceExpression = "";
- this.curLexerMode = 0;
- this.lexerModeStack = [];
- this.wasSpaceIndentation = false;
- this.wasTabIndentation = false;
- this.wasIndentationMixedWithSpacesAndTabs = false;
- this.curToken = undefined;
- this.ffgToken = undefined;
- }
-
- private checkNextToken(): void {
- if (this.previousPendingTokenType == PythonLexer.EOF)
- return;
-
- if (this.indentLengthStack.isEmpty()) { // We're at the first token
- this.insertENCODINGtoken();
- this.setCurrentAndFollowingTokens();
- this.handleStartOfInput();
- } else {
- this.setCurrentAndFollowingTokens();
- }
-
- switch (this.curToken!.type) {
- case PythonLexer.NEWLINE:
- this.handleNEWLINEtoken();
- break;
- case PythonLexer.LPAR:
- case PythonLexer.LSQB:
- case PythonLexer.LBRACE:
- this.opened++;
- this.addPendingToken(this.curToken!);
- break;
- case PythonLexer.RPAR:
- case PythonLexer.RSQB:
- case PythonLexer.RBRACE:
- this.opened--;
- this.addPendingToken(this.curToken!);
- break;
- case PythonLexer.FSTRING_MIDDLE:
- this.handleFSTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field
- this.addPendingToken(this.curToken!);
- break;
- case PythonLexer.COLONEQUAL:
- this.handleCOLONEQUALtokenInFString();
- break;
- case PythonLexer.ERRORTOKEN:
- this.reportLexerError(`token recognition error at: '${this.curToken!.text}'`);
- this.addPendingToken(this.curToken!);
- break;
- case PythonLexer.EOF:
- this.handleEOFtoken();
- break;
- default:
- this.addPendingToken(this.curToken!);
- }
- this.handleFORMAT_SPECIFICATION_MODE();
- }
-
- private setCurrentAndFollowingTokens(): void {
- this.curToken = this.ffgToken == undefined
- ? super.nextToken()
- : this.ffgToken;
-
- this.checkCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)!
-
- this.ffgToken = this.curToken.type === PythonLexer.EOF
- ? this.curToken
- : super.nextToken();
- }
-
- private insertENCODINGtoken(): void { // https://peps.python.org/pep-0263/
- let lineBuilder: string = '';
- let encodingName: string = '';
- let lineCount: number = 0;
- const ws_commentPattern: RegExp = /^[ \t\f]*(#.*)?$/;
- const charStream: CharStream = this._input;
- const size: number = charStream.size;
-
- charStream.seek(0);
- for (let i = 0; i < size; i++) {
- const c: string = String.fromCharCode(charStream.LA(i + 1));
- lineBuilder += c;
-
- if (c === '\n' || i === size - 1) {
- const line: string = lineBuilder.replace(/\r/g, '').replace(/\n/g, '');
- if (ws_commentPattern.test(line)) { // WS* + COMMENT? found
- encodingName = this.getEncodingName(line);
- if (encodingName !== '') {
- break; // encoding found
- }
- } else {
- break; // statement or backslash found (line is not empty, not whitespace(s), not comment)
- }
-
- lineCount++;
- if (lineCount >= 2) {
- break; // check only the first two lines
- }
- lineBuilder = '';
- }
- }
-
- if (encodingName === '') {
- encodingName = 'utf-8'; // default Python source code encoding
- }
-
- const encodingToken = new CommonToken([this, this._input], PythonLexer.ENCODING, Token.HIDDEN_CHANNEL, 0, 0);
- encodingToken.text = encodingName;
- encodingToken.line = 0;
- encodingToken.column = -1;
- this.addPendingToken(encodingToken);
- }
-
- private getEncodingName(commentText: string): string { // https://peps.python.org/pep-0263/#defining-the-encoding
- const encodingCommentPattern: RegExp = /^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)/;
- const match: RegExpMatchArray | null = commentText.match(encodingCommentPattern);
- return match ? match[1] : '';
- }
-
- // initialize the indentLengthStack
- // hide the leading NEWLINE token(s)
- // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
- // insert a leading INDENT token if necessary
- private handleStartOfInput(): void {
- // initialize the stack with a default 0 indentation length
- this.indentLengthStack.push(0); // this will never be popped off
- while (this.curToken!.type !== PythonLexer.EOF) {
- if (this.curToken!.channel === Token.DEFAULT_CHANNEL) {
- if (this.curToken!.type === PythonLexer.NEWLINE) {
- // all the NEWLINE tokens must be ignored before the first statement
- this.hideAndAddPendingToken(this.curToken!);
- } else { // We're at the first statement
- this.insertLeadingIndentToken();
- return; // continue the processing of the current token with checkNextToken()
- }
- } else {
- this.addPendingToken(this.curToken!); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
- }
- this.setCurrentAndFollowingTokens();
- } // continue the processing of the EOF token with checkNextToken()
- }
-
- private insertLeadingIndentToken(): void {
- if (this.previousPendingTokenType === PythonLexer.WS) {
- const prevToken: Token = this.pendingTokens.at(-1)!; /* .peekLast() */ // WS token
- if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement
- const errMsg: string = "first statement indented";
- this.reportLexerError(errMsg);
- // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
- this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken!);
- }
- }
- }
-
- private handleNEWLINEtoken(): void {
- if (this.lexerModeStack.length > 0) {
- this.addPendingToken(this.curToken!);
- } else if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
- this.hideAndAddPendingToken(this.curToken!);
- } else {
- const nlToken: Token = this.curToken?.clone()!; // save the current NEWLINE token
- const isLookingAhead: boolean = this.ffgToken!.type === PythonLexer.WS;
- if (isLookingAhead) {
- this.setCurrentAndFollowingTokens(); // set the next two tokens
- }
-
- switch (this.ffgToken!.type) {
- case PythonLexer.NEWLINE: // We're before a blank line
- case PythonLexer.COMMENT: // We're before a comment
- this.hideAndAddPendingToken(nlToken);
- if (isLookingAhead) {
- this.addPendingToken(this.curToken!); // WS token
- }
- break;
- default:
- this.addPendingToken(nlToken);
- if (isLookingAhead) { // We're on whitespace(s) followed by a statement
- const indentationLength: number = this.ffgToken!.type === PythonLexer.EOF ?
- 0 :
- this.getIndentationLength(this.curToken!.text);
-
- if (indentationLength !== this.INVALID_LENGTH) {
- this.addPendingToken(this.curToken!); // WS token
- this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
- } else {
- this.reportError("inconsistent use of tabs and spaces in indentation");
- }
- } else { // We're at a newline followed by a statement (there is no whitespace before the statement)
- this.insertIndentOrDedentToken(0); // may insert DEDENT token(s)
- }
- }
- }
- }
-
- private insertIndentOrDedentToken(indentLength: number): void {
- let prevIndentLength: number = this.indentLengthStack.peek()!;
- if (indentLength > prevIndentLength) {
- this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!);
- this.indentLengthStack.push(indentLength);
- } else {
- while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
- this.indentLengthStack.pop();
- prevIndentLength = this.indentLengthStack.peek()!;
- if (indentLength <= prevIndentLength) {
- this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!);
- } else {
- this.reportError("inconsistent dedent");
- }
- }
- }
- }
-
- private checkCurToken(): void {
- switch (this.curToken!.type) {
- case PythonLexer.FSTRING_START:
- this.setLexerModeByFSTRING_STARTtoken();
- return;
- case PythonLexer.FSTRING_MIDDLE:
- this.handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field
- if (this.curToken!.type === PythonLexer.FSTRING_MIDDLE) {
- return;
- }
- break;
- case PythonLexer.FSTRING_END:
- this.popLexerMode();
- return;
- default:
- if (this.lexerModeStack.length === 0) {
- return;
- }
- }
-
- switch (this.curToken!.type) { // the following tokens can only come from default mode (after an LBRACE in fstring)
- case PythonLexer.NEWLINE:
- // append the current brace expression with the current newline
- this.appendToBraceExpression(this.curToken!.text);
- this.curToken!.channel = Token.HIDDEN_CHANNEL;
- break;
- case PythonLexer.LBRACE:
- // the outermost brace expression cannot be a dictionary comprehension or a set comprehension
- this.braceExpressionStack.push("{");
- this.paren_or_bracket_openedStack.push(0);
- this.pushLexerMode(Lexer.DEFAULT_MODE);
- break;
- case PythonLexer.LPAR:
- case PythonLexer.LSQB:
- // append the current brace expression with a "(" or a "["
- this.appendToBraceExpression(this.curToken!.text);
- // https://peps.python.org/pep-0498/#lambdas-inside-expressions
- this.incrementBraceStack();
- break;
- case PythonLexer.RPAR:
- case PythonLexer.RSQB:
- // append the current brace expression with a ")" or a "]"
- this.appendToBraceExpression(this.curToken!.text);
- this.decrementBraceStack();
- break;
- case PythonLexer.COLON:
- case PythonLexer.COLONEQUAL:
- // append the current brace expression with a ":" or a ":="
- this.appendToBraceExpression(this.curToken!.text);
- this.setLexerModeByCOLONorCOLONEQUALtoken();
- break;
- case PythonLexer.RBRACE:
- this.setLexerModeAfterRBRACEtoken();
- break;
- default:
- // append the current brace expression with the current token text
- this.appendToBraceExpression(this.curToken!.text);
- }
- }
-
- private appendToBraceExpression(text: string): void {
- this.braceExpressionStack[this.braceExpressionStack.length - 1] += text;
- }
-
- private incrementBraceStack(): void { // increment the last element (peek() + 1)
- this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1]++;
- }
-
- private decrementBraceStack(): void { // decrement the last element (peek() - 1)
- this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1]--;
- }
-
- private setLexerModeAfterRBRACEtoken(): void {
- switch (this.curLexerMode) {
- case Lexer.DEFAULT_MODE:
- this.popLexerMode();
- this.popByBRACE();
- break;
- case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE:
- case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE:
- this.popLexerMode();
- this.popLexerMode();
- this.popByBRACE();
- break;
- default:
- this.reportLexerError("f-string: single '}' is not allowed");
- }
- }
-
- private setLexerModeByFSTRING_STARTtoken(): void {
- const text = this.curToken!.text.toLowerCase();
- const modeMap: { [key: string]: number } = {
- "f'": PythonLexer.SQ1__FSTRING_MODE,
- "rf'": PythonLexer.SQ1R_FSTRING_MODE,
- "fr'": PythonLexer.SQ1R_FSTRING_MODE,
- 'f"': PythonLexer.DQ1__FSTRING_MODE,
- 'rf"': PythonLexer.DQ1R_FSTRING_MODE,
- 'fr"': PythonLexer.DQ1R_FSTRING_MODE,
- "f'''": PythonLexer.SQ3__FSTRING_MODE,
- "rf'''": PythonLexer.SQ3R_FSTRING_MODE,
- "fr'''": PythonLexer.SQ3R_FSTRING_MODE,
- 'f"""': PythonLexer.DQ3__FSTRING_MODE,
- 'rf"""': PythonLexer.DQ3R_FSTRING_MODE,
- 'fr"""': PythonLexer.DQ3R_FSTRING_MODE,
- };
- const mode = modeMap[text];
- if (mode !== undefined) {
- this.pushLexerMode(mode);
- }
- }
-
- private setLexerModeByCOLONorCOLONEQUALtoken(): void {
- if (this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1] === 0) { // stack peek == 0
- const previousMode = this.lexerModeStack[this.lexerModeStack.length - 1]; // stack peek
- switch (previousMode) { // check the previous lexer mode (the current is DEFAULT_MODE)
- case PythonLexer.SQ1__FSTRING_MODE:
- case PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.SQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ1R_FSTRING_MODE:
- case PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.SQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ1__FSTRING_MODE:
- case PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.DQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ1R_FSTRING_MODE:
- case PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.DQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ3__FSTRING_MODE:
- case PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.SQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.SQ3R_FSTRING_MODE:
- case PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.SQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ3__FSTRING_MODE:
- case PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.DQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- case PythonLexer.DQ3R_FSTRING_MODE:
- case PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE:
- this.pushLexerMode(PythonLexer.DQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
- break;
- }
- }
- }
-
- private popByBRACE(): void {
- this.paren_or_bracket_openedStack.pop();
- this.prevBraceExpression = this.braceExpressionStack.pop() + "}";
- if (this.braceExpressionStack.length > 0) {
- // append the current brace expression with the previous brace expression
- this.braceExpressionStack[this.braceExpressionStack.length - 1] += this.prevBraceExpression;
- }
- }
-
- private handleFSTRING_MIDDLEtokenWithDoubleBrace(): void {
- // Replace the trailing double brace with a single brace and insert a hidden brace token
- switch (this.getLastTwoCharsOfTheCurTokenText()) {
- case "{{":
- this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL);
- break;
- case "}}":
- this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL);
- break;
- }
- }
-
- private handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(): void {
- // Replace the trailing quote + left_brace with a quote and insert an LBRACE token
- // Replace the trailing backslash + left_brace with a backslash and insert an LBRACE token
- switch (this.getLastTwoCharsOfTheCurTokenText()) {
- case "\"{":
- case "'{":
- case "\\{":
- this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL);
- break;
- }
- }
-
- private getLastTwoCharsOfTheCurTokenText(): string {
- return this.curToken!.text.slice(-2);
- }
-
- private trimLastCharAddPendingTokenSetCurToken(type: number, text: string, channel: number): void {
- // Trim the last char and add the modified curToken to the pendingTokens stack
- const tokenTextWithoutLastChar = this.curToken!.text.slice(0, -1);
- this.curToken!.text = tokenTextWithoutLastChar;
- this.curToken!.stop -= 1;
- this.addPendingToken(this.curToken!);
-
- this.createNewCurToken(type, text, channel); // Set curToken
- }
-
- private handleCOLONEQUALtokenInFString(): void {
- if (
- this.lexerModeStack.length > 0 &&
- this.paren_or_bracket_openedStack[this.paren_or_bracket_openedStack.length - 1] === 0 // stack peek == 0
- ) {
- // In fstring, a colonequal (walrus operator) can only be used in parentheses
- // Not in parentheses, replace COLONEQUAL token with COLON as format specifier
- // and insert the equal symbol to the following FSTRING_MIDDLE token
- this.curToken!.type = PythonLexer.COLON;
- this.curToken!.text = ":";
- this.curToken!.stop = this.curToken!.start;
-
- if (this.ffgToken!.type === PythonLexer.FSTRING_MIDDLE) {
- this.ffgToken!.text = "=" + this.ffgToken!.text;
- this.ffgToken!.start -= 1;
- this.ffgToken!.column -= 1;
- } else {
- this.addPendingToken(this.curToken!);
- this.createNewCurToken(PythonLexer.FSTRING_MIDDLE, "=", Token.DEFAULT_CHANNEL);
- }
- }
- this.addPendingToken(this.curToken!);
- }
-
- private createNewCurToken(type: number, text: string, channel: number): void {
- const ctkn = this.curToken!.clone();
- ctkn.type = type;
- ctkn.text = text;
- ctkn.channel = channel;
- ctkn.column += 1;
- ctkn.start += 1;
- ctkn.stop = ctkn.start;
- this.curToken = ctkn;
- }
-
- private pushLexerMode(mode: number): void {
- this.pushMode(mode);
- this.lexerModeStack.push(this.curLexerMode);
- this.curLexerMode = mode;
- }
-
- private popLexerMode(): void {
- this.popMode();
- this.curLexerMode = this.lexerModeStack.pop()!;
- }
-
- private handleFORMAT_SPECIFICATION_MODE() {
- if (this.lexerModeStack.length > 0 &&
- this.ffgToken!.type === PythonLexer.RBRACE) {
-
- // insert an empty FSTRING_MIDDLE token instead of the missing format specification
- switch (this.curToken!.type) {
- case PythonLexer.COLON:
- this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken!);
- break;
- case PythonLexer.RBRACE:
- // only if the previous brace expression is not a dictionary comprehension or set comprehension
- if (!this.isDictionaryComprehensionOrSetComprehension(this.prevBraceExpression)) {
- this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken!);
- }
- break;
- }
- }
- }
-
- private isDictionaryComprehensionOrSetComprehension(code: string): boolean {
- const inputStream: CharStream = CharStreams.fromString(code);
- const lexer = new PythonLexer(inputStream);
- const tokenStream = new CommonTokenStream(lexer);
- let parser = new PythonParser(tokenStream);
-
- // Disable error listeners to suppress console output
- lexer.removeErrorListeners();
- parser.removeErrorListeners();
-
- parser.dictcomp(); // Try parsing as dictionary comprehension
- if (parser.syntaxErrorsCount === 0)
- return true;
-
- parser = new PythonParser(tokenStream);
- (tokenStream as any).seek(0); // seek method is not declared in CommonTokenStream.d.ts
- parser.removeErrorListeners();
- parser.setcomp(); // Try parsing as set comprehension
- return parser.syntaxErrorsCount === 0;
- }
-
- private insertTrailingTokens(): void {
- switch (this.lastPendingTokenTypeFromDefaultChannel) {
- case PythonLexer.NEWLINE:
- case PythonLexer.DEDENT:
- break; // no trailing NEWLINE token is needed
- default:
- // insert an extra trailing NEWLINE token that serves as the end of the last statement
- this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken!); // ffgToken is EOF
- }
- this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
- }
-
- private handleEOFtoken(): void {
- if (this.lastPendingTokenTypeFromDefaultChannel > 0) {
- // there was a statement in the input (leading NEWLINE tokens are hidden)
- this.insertTrailingTokens();
- }
- this.addPendingToken(this.curToken!);
- }
-
- private hideAndAddPendingToken(tkn: Token): void {
- tkn.channel = Token.HIDDEN_CHANNEL;
- this.addPendingToken(tkn);
- }
-
- private createAndAddPendingToken(type: number, channel: number, text: string | null, sampleToken: Token): void {
- const tkn: Token = sampleToken.clone();
- tkn.type = type;
- tkn.channel = channel;
- tkn.stop = sampleToken.start - 1;
- tkn.text = text == null ?
- `<${this.getSymbolicNames()[type]}>` :
- text;
-
- this.addPendingToken(tkn);
- }
-
- private addPendingToken(tkn: Token): void {
- // save the last pending token type because the pendingTokens list can be empty by the nextToken()
- this.previousPendingTokenType = tkn.type;
- if (tkn.channel === Token.DEFAULT_CHANNEL) {
- this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
- }
- this.pendingTokens.push(tkn) /* .addLast(token) */;
- }
-
- private getIndentationLength(indentText: string): number { // the indentText may contain spaces, tabs or form feeds
- const TAB_LENGTH: number = 8; // the standard number of spaces to replace a tab to spaces
- let length: number = 0;
- for (let ch of indentText) {
- switch (ch) {
- case " ":
- this.wasSpaceIndentation = true;
- length += 1;
- break;
- case "\t":
- this.wasTabIndentation = true;
- length += TAB_LENGTH - (length % TAB_LENGTH);
- break;
- case "\f": // form feed
- length = 0;
- break;
- }
- }
-
- if (this.wasTabIndentation && this.wasSpaceIndentation) {
- if (!this.wasIndentationMixedWithSpacesAndTabs) {
- this.wasIndentationMixedWithSpacesAndTabs = true;
- length = this.INVALID_LENGTH; // only for the first inconsistent indent
- }
- }
- return length;
- }
-
- private reportLexerError(errMsg: string): void {
- this.getErrorListener().syntaxError(this, 0 /* this.curToken */, this.curToken!.line, this.curToken!.column, " LEXER" + this.ERR_TXT + errMsg, undefined);
- }
-
- private reportError(errMsg: string): void {
- this.reportLexerError(errMsg);
-
- // the ERRORTOKEN will raise an error in the parser
- this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken!);
- }
-}
diff --git a/python/python3_13/examples/_colorize.py b/python/python3_13/examples/_colorize.py
deleted file mode 100644
index 845fb57a90..0000000000
--- a/python/python3_13/examples/_colorize.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import io
-import os
-import sys
-
-COLORIZE = True
-
-
-class ANSIColors:
- BOLD_GREEN = "\x1b[1;32m"
- BOLD_MAGENTA = "\x1b[1;35m"
- BOLD_RED = "\x1b[1;31m"
- GREEN = "\x1b[32m"
- GREY = "\x1b[90m"
- MAGENTA = "\x1b[35m"
- RED = "\x1b[31m"
- RESET = "\x1b[0m"
- YELLOW = "\x1b[33m"
-
-
-NoColors = ANSIColors()
-
-for attr in dir(NoColors):
- if not attr.startswith("__"):
- setattr(NoColors, attr, "")
-
-
-def get_colors(colorize: bool = False) -> ANSIColors:
- if colorize or can_colorize():
- return ANSIColors()
- else:
- return NoColors
-
-
-def can_colorize() -> bool:
- if sys.platform == "win32":
- try:
- import nt
-
- if not nt._supports_virtual_terminal():
- return False
- except (ImportError, AttributeError):
- return False
- if not sys.flags.ignore_environment:
- if os.environ.get("PYTHON_COLORS") == "0":
- return False
- if os.environ.get("PYTHON_COLORS") == "1":
- return True
- if "NO_COLOR" in os.environ:
- return False
- if not COLORIZE:
- return False
- if not sys.flags.ignore_environment:
- if "FORCE_COLOR" in os.environ:
- return True
- if os.environ.get("TERM") == "dumb":
- return False
-
- if not hasattr(sys.stderr, "fileno"):
- return False
-
- try:
- return os.isatty(sys.stderr.fileno())
- except io.UnsupportedOperation:
- return sys.stderr.isatty()
diff --git a/python/python3_13/examples/_compression.py b/python/python3_13/examples/_compression.py
deleted file mode 100644
index e8b70aa0a3..0000000000
--- a/python/python3_13/examples/_compression.py
+++ /dev/null
@@ -1,162 +0,0 @@
-"""Internal classes used by the gzip, lzma and bz2 modules"""
-
-import io
-import sys
-
-BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE # Compressed data read chunk size
-
-
-class BaseStream(io.BufferedIOBase):
- """Mode-checking helper functions."""
-
- def _check_not_closed(self):
- if self.closed:
- raise ValueError("I/O operation on closed file")
-
- def _check_can_read(self):
- if not self.readable():
- raise io.UnsupportedOperation("File not open for reading")
-
- def _check_can_write(self):
- if not self.writable():
- raise io.UnsupportedOperation("File not open for writing")
-
- def _check_can_seek(self):
- if not self.readable():
- raise io.UnsupportedOperation("Seeking is only supported "
- "on files open for reading")
- if not self.seekable():
- raise io.UnsupportedOperation("The underlying file object "
- "does not support seeking")
-
-
-class DecompressReader(io.RawIOBase):
- """Adapts the decompressor API to a RawIOBase reader API"""
-
- def readable(self):
- return True
-
- def __init__(self, fp, decomp_factory, trailing_error=(), **decomp_args):
- self._fp = fp
- self._eof = False
- self._pos = 0 # Current offset in decompressed stream
-
- # Set to size of decompressed stream once it is known, for SEEK_END
- self._size = -1
-
- # Save the decompressor factory and arguments.
- # If the file contains multiple compressed streams, each
- # stream will need a separate decompressor object. A new decompressor
- # object is also needed when implementing a backwards seek().
- self._decomp_factory = decomp_factory
- self._decomp_args = decomp_args
- self._decompressor = self._decomp_factory(**self._decomp_args)
-
- # Exception class to catch from decompressor signifying invalid
- # trailing data to ignore
- self._trailing_error = trailing_error
-
- def close(self):
- self._decompressor = None
- return super().close()
-
- def seekable(self):
- return self._fp.seekable()
-
- def readinto(self, b):
- with memoryview(b) as view, view.cast("B") as byte_view:
- data = self.read(len(byte_view))
- byte_view[:len(data)] = data
- return len(data)
-
- def read(self, size=-1):
- if size < 0:
- return self.readall()
-
- if not size or self._eof:
- return b""
- data = None # Default if EOF is encountered
- # Depending on the input data, our call to the decompressor may not
- # return any data. In this case, try again after reading another block.
- while True:
- if self._decompressor.eof:
- rawblock = (self._decompressor.unused_data or
- self._fp.read(BUFFER_SIZE))
- if not rawblock:
- break
- # Continue to next stream.
- self._decompressor = self._decomp_factory(
- **self._decomp_args)
- try:
- data = self._decompressor.decompress(rawblock, size)
- except self._trailing_error:
- # Trailing data isn't a valid compressed stream; ignore it.
- break
- else:
- if self._decompressor.needs_input:
- rawblock = self._fp.read(BUFFER_SIZE)
- if not rawblock:
- raise EOFError("Compressed file ended before the "
- "end-of-stream marker was reached")
- else:
- rawblock = b""
- data = self._decompressor.decompress(rawblock, size)
- if data:
- break
- if not data:
- self._eof = True
- self._size = self._pos
- return b""
- self._pos += len(data)
- return data
-
- def readall(self):
- chunks = []
- # sys.maxsize means the max length of output buffer is unlimited,
- # so that the whole input buffer can be decompressed within one
- # .decompress() call.
- while data := self.read(sys.maxsize):
- chunks.append(data)
-
- return b"".join(chunks)
-
- # Rewind the file to the beginning of the data stream.
- def _rewind(self):
- self._fp.seek(0)
- self._eof = False
- self._pos = 0
- self._decompressor = self._decomp_factory(**self._decomp_args)
-
- def seek(self, offset, whence=io.SEEK_SET):
- # Recalculate offset as an absolute file position.
- if whence == io.SEEK_SET:
- pass
- elif whence == io.SEEK_CUR:
- offset = self._pos + offset
- elif whence == io.SEEK_END:
- # Seeking relative to EOF - we need to know the file's size.
- if self._size < 0:
- while self.read(io.DEFAULT_BUFFER_SIZE):
- pass
- offset = self._size + offset
- else:
- raise ValueError("Invalid value for whence: {}".format(whence))
-
- # Make it so that offset is the number of bytes to skip forward.
- if offset < self._pos:
- self._rewind()
- else:
- offset -= self._pos
-
- # Read and discard data until we reach the desired position.
- while offset > 0:
- data = self.read(min(io.DEFAULT_BUFFER_SIZE, offset))
- if not data:
- break
- offset -= len(data)
-
- return self._pos
-
- def tell(self):
- """Return the current file position."""
- return self._pos
diff --git a/python/python3_13/examples/_opcode_metadata.py b/python/python3_13/examples/_opcode_metadata.py
deleted file mode 100644
index b3d7b8103e..0000000000
--- a/python/python3_13/examples/_opcode_metadata.py
+++ /dev/null
@@ -1,343 +0,0 @@
-# This file is generated by Tools/cases_generator/py_metadata_generator.py
-# from:
-# Python/bytecodes.c
-# Do not edit!
-_specializations = {
- "RESUME": [
- "RESUME_CHECK",
- ],
- "TO_BOOL": [
- "TO_BOOL_ALWAYS_TRUE",
- "TO_BOOL_BOOL",
- "TO_BOOL_INT",
- "TO_BOOL_LIST",
- "TO_BOOL_NONE",
- "TO_BOOL_STR",
- ],
- "BINARY_OP": [
- "BINARY_OP_MULTIPLY_INT",
- "BINARY_OP_ADD_INT",
- "BINARY_OP_SUBTRACT_INT",
- "BINARY_OP_MULTIPLY_FLOAT",
- "BINARY_OP_ADD_FLOAT",
- "BINARY_OP_SUBTRACT_FLOAT",
- "BINARY_OP_ADD_UNICODE",
- "BINARY_OP_INPLACE_ADD_UNICODE",
- ],
- "BINARY_SUBSCR": [
- "BINARY_SUBSCR_DICT",
- "BINARY_SUBSCR_GETITEM",
- "BINARY_SUBSCR_LIST_INT",
- "BINARY_SUBSCR_STR_INT",
- "BINARY_SUBSCR_TUPLE_INT",
- ],
- "STORE_SUBSCR": [
- "STORE_SUBSCR_DICT",
- "STORE_SUBSCR_LIST_INT",
- ],
- "SEND": [
- "SEND_GEN",
- ],
- "UNPACK_SEQUENCE": [
- "UNPACK_SEQUENCE_TWO_TUPLE",
- "UNPACK_SEQUENCE_TUPLE",
- "UNPACK_SEQUENCE_LIST",
- ],
- "STORE_ATTR": [
- "STORE_ATTR_INSTANCE_VALUE",
- "STORE_ATTR_SLOT",
- "STORE_ATTR_WITH_HINT",
- ],
- "LOAD_GLOBAL": [
- "LOAD_GLOBAL_MODULE",
- "LOAD_GLOBAL_BUILTIN",
- ],
- "LOAD_SUPER_ATTR": [
- "LOAD_SUPER_ATTR_ATTR",
- "LOAD_SUPER_ATTR_METHOD",
- ],
- "LOAD_ATTR": [
- "LOAD_ATTR_INSTANCE_VALUE",
- "LOAD_ATTR_MODULE",
- "LOAD_ATTR_WITH_HINT",
- "LOAD_ATTR_SLOT",
- "LOAD_ATTR_CLASS",
- "LOAD_ATTR_PROPERTY",
- "LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN",
- "LOAD_ATTR_METHOD_WITH_VALUES",
- "LOAD_ATTR_METHOD_NO_DICT",
- "LOAD_ATTR_METHOD_LAZY_DICT",
- "LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES",
- "LOAD_ATTR_NONDESCRIPTOR_NO_DICT",
- ],
- "COMPARE_OP": [
- "COMPARE_OP_FLOAT",
- "COMPARE_OP_INT",
- "COMPARE_OP_STR",
- ],
- "CONTAINS_OP": [
- "CONTAINS_OP_SET",
- "CONTAINS_OP_DICT",
- ],
- "FOR_ITER": [
- "FOR_ITER_LIST",
- "FOR_ITER_TUPLE",
- "FOR_ITER_RANGE",
- "FOR_ITER_GEN",
- ],
- "CALL": [
- "CALL_BOUND_METHOD_EXACT_ARGS",
- "CALL_PY_EXACT_ARGS",
- "CALL_TYPE_1",
- "CALL_STR_1",
- "CALL_TUPLE_1",
- "CALL_BUILTIN_CLASS",
- "CALL_BUILTIN_O",
- "CALL_BUILTIN_FAST",
- "CALL_BUILTIN_FAST_WITH_KEYWORDS",
- "CALL_LEN",
- "CALL_ISINSTANCE",
- "CALL_LIST_APPEND",
- "CALL_METHOD_DESCRIPTOR_O",
- "CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS",
- "CALL_METHOD_DESCRIPTOR_NOARGS",
- "CALL_METHOD_DESCRIPTOR_FAST",
- "CALL_ALLOC_AND_ENTER_INIT",
- "CALL_PY_GENERAL",
- "CALL_BOUND_METHOD_GENERAL",
- "CALL_NON_PY_GENERAL",
- ],
-}
-
-_specialized_opmap = {
- 'BINARY_OP_ADD_FLOAT': 150,
- 'BINARY_OP_ADD_INT': 151,
- 'BINARY_OP_ADD_UNICODE': 152,
- 'BINARY_OP_INPLACE_ADD_UNICODE': 3,
- 'BINARY_OP_MULTIPLY_FLOAT': 153,
- 'BINARY_OP_MULTIPLY_INT': 154,
- 'BINARY_OP_SUBTRACT_FLOAT': 155,
- 'BINARY_OP_SUBTRACT_INT': 156,
- 'BINARY_SUBSCR_DICT': 157,
- 'BINARY_SUBSCR_GETITEM': 158,
- 'BINARY_SUBSCR_LIST_INT': 159,
- 'BINARY_SUBSCR_STR_INT': 160,
- 'BINARY_SUBSCR_TUPLE_INT': 161,
- 'CALL_ALLOC_AND_ENTER_INIT': 162,
- 'CALL_BOUND_METHOD_EXACT_ARGS': 163,
- 'CALL_BOUND_METHOD_GENERAL': 164,
- 'CALL_BUILTIN_CLASS': 165,
- 'CALL_BUILTIN_FAST': 166,
- 'CALL_BUILTIN_FAST_WITH_KEYWORDS': 167,
- 'CALL_BUILTIN_O': 168,
- 'CALL_ISINSTANCE': 169,
- 'CALL_LEN': 170,
- 'CALL_LIST_APPEND': 171,
- 'CALL_METHOD_DESCRIPTOR_FAST': 172,
- 'CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS': 173,
- 'CALL_METHOD_DESCRIPTOR_NOARGS': 174,
- 'CALL_METHOD_DESCRIPTOR_O': 175,
- 'CALL_NON_PY_GENERAL': 176,
- 'CALL_PY_EXACT_ARGS': 177,
- 'CALL_PY_GENERAL': 178,
- 'CALL_STR_1': 179,
- 'CALL_TUPLE_1': 180,
- 'CALL_TYPE_1': 181,
- 'COMPARE_OP_FLOAT': 182,
- 'COMPARE_OP_INT': 183,
- 'COMPARE_OP_STR': 184,
- 'CONTAINS_OP_DICT': 185,
- 'CONTAINS_OP_SET': 186,
- 'FOR_ITER_GEN': 187,
- 'FOR_ITER_LIST': 188,
- 'FOR_ITER_RANGE': 189,
- 'FOR_ITER_TUPLE': 190,
- 'LOAD_ATTR_CLASS': 191,
- 'LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN': 192,
- 'LOAD_ATTR_INSTANCE_VALUE': 193,
- 'LOAD_ATTR_METHOD_LAZY_DICT': 194,
- 'LOAD_ATTR_METHOD_NO_DICT': 195,
- 'LOAD_ATTR_METHOD_WITH_VALUES': 196,
- 'LOAD_ATTR_MODULE': 197,
- 'LOAD_ATTR_NONDESCRIPTOR_NO_DICT': 198,
- 'LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES': 199,
- 'LOAD_ATTR_PROPERTY': 200,
- 'LOAD_ATTR_SLOT': 201,
- 'LOAD_ATTR_WITH_HINT': 202,
- 'LOAD_GLOBAL_BUILTIN': 203,
- 'LOAD_GLOBAL_MODULE': 204,
- 'LOAD_SUPER_ATTR_ATTR': 205,
- 'LOAD_SUPER_ATTR_METHOD': 206,
- 'RESUME_CHECK': 207,
- 'SEND_GEN': 208,
- 'STORE_ATTR_INSTANCE_VALUE': 209,
- 'STORE_ATTR_SLOT': 210,
- 'STORE_ATTR_WITH_HINT': 211,
- 'STORE_SUBSCR_DICT': 212,
- 'STORE_SUBSCR_LIST_INT': 213,
- 'TO_BOOL_ALWAYS_TRUE': 214,
- 'TO_BOOL_BOOL': 215,
- 'TO_BOOL_INT': 216,
- 'TO_BOOL_LIST': 217,
- 'TO_BOOL_NONE': 218,
- 'TO_BOOL_STR': 219,
- 'UNPACK_SEQUENCE_LIST': 220,
- 'UNPACK_SEQUENCE_TUPLE': 221,
- 'UNPACK_SEQUENCE_TWO_TUPLE': 222,
-}
-
-opmap = {
- 'CACHE': 0,
- 'RESERVED': 17,
- 'RESUME': 149,
- 'INSTRUMENTED_LINE': 254,
- 'BEFORE_ASYNC_WITH': 1,
- 'BEFORE_WITH': 2,
- 'BINARY_SLICE': 4,
- 'BINARY_SUBSCR': 5,
- 'CHECK_EG_MATCH': 6,
- 'CHECK_EXC_MATCH': 7,
- 'CLEANUP_THROW': 8,
- 'DELETE_SUBSCR': 9,
- 'END_ASYNC_FOR': 10,
- 'END_FOR': 11,
- 'END_SEND': 12,
- 'EXIT_INIT_CHECK': 13,
- 'FORMAT_SIMPLE': 14,
- 'FORMAT_WITH_SPEC': 15,
- 'GET_AITER': 16,
- 'GET_ANEXT': 18,
- 'GET_ITER': 19,
- 'GET_LEN': 20,
- 'GET_YIELD_FROM_ITER': 21,
- 'INTERPRETER_EXIT': 22,
- 'LOAD_ASSERTION_ERROR': 23,
- 'LOAD_BUILD_CLASS': 24,
- 'LOAD_LOCALS': 25,
- 'MAKE_FUNCTION': 26,
- 'MATCH_KEYS': 27,
- 'MATCH_MAPPING': 28,
- 'MATCH_SEQUENCE': 29,
- 'NOP': 30,
- 'POP_EXCEPT': 31,
- 'POP_TOP': 32,
- 'PUSH_EXC_INFO': 33,
- 'PUSH_NULL': 34,
- 'RETURN_GENERATOR': 35,
- 'RETURN_VALUE': 36,
- 'SETUP_ANNOTATIONS': 37,
- 'STORE_SLICE': 38,
- 'STORE_SUBSCR': 39,
- 'TO_BOOL': 40,
- 'UNARY_INVERT': 41,
- 'UNARY_NEGATIVE': 42,
- 'UNARY_NOT': 43,
- 'WITH_EXCEPT_START': 44,
- 'BINARY_OP': 45,
- 'BUILD_CONST_KEY_MAP': 46,
- 'BUILD_LIST': 47,
- 'BUILD_MAP': 48,
- 'BUILD_SET': 49,
- 'BUILD_SLICE': 50,
- 'BUILD_STRING': 51,
- 'BUILD_TUPLE': 52,
- 'CALL': 53,
- 'CALL_FUNCTION_EX': 54,
- 'CALL_INTRINSIC_1': 55,
- 'CALL_INTRINSIC_2': 56,
- 'CALL_KW': 57,
- 'COMPARE_OP': 58,
- 'CONTAINS_OP': 59,
- 'CONVERT_VALUE': 60,
- 'COPY': 61,
- 'COPY_FREE_VARS': 62,
- 'DELETE_ATTR': 63,
- 'DELETE_DEREF': 64,
- 'DELETE_FAST': 65,
- 'DELETE_GLOBAL': 66,
- 'DELETE_NAME': 67,
- 'DICT_MERGE': 68,
- 'DICT_UPDATE': 69,
- 'ENTER_EXECUTOR': 70,
- 'EXTENDED_ARG': 71,
- 'FOR_ITER': 72,
- 'GET_AWAITABLE': 73,
- 'IMPORT_FROM': 74,
- 'IMPORT_NAME': 75,
- 'IS_OP': 76,
- 'JUMP_BACKWARD': 77,
- 'JUMP_BACKWARD_NO_INTERRUPT': 78,
- 'JUMP_FORWARD': 79,
- 'LIST_APPEND': 80,
- 'LIST_EXTEND': 81,
- 'LOAD_ATTR': 82,
- 'LOAD_CONST': 83,
- 'LOAD_DEREF': 84,
- 'LOAD_FAST': 85,
- 'LOAD_FAST_AND_CLEAR': 86,
- 'LOAD_FAST_CHECK': 87,
- 'LOAD_FAST_LOAD_FAST': 88,
- 'LOAD_FROM_DICT_OR_DEREF': 89,
- 'LOAD_FROM_DICT_OR_GLOBALS': 90,
- 'LOAD_GLOBAL': 91,
- 'LOAD_NAME': 92,
- 'LOAD_SUPER_ATTR': 93,
- 'MAKE_CELL': 94,
- 'MAP_ADD': 95,
- 'MATCH_CLASS': 96,
- 'POP_JUMP_IF_FALSE': 97,
- 'POP_JUMP_IF_NONE': 98,
- 'POP_JUMP_IF_NOT_NONE': 99,
- 'POP_JUMP_IF_TRUE': 100,
- 'RAISE_VARARGS': 101,
- 'RERAISE': 102,
- 'RETURN_CONST': 103,
- 'SEND': 104,
- 'SET_ADD': 105,
- 'SET_FUNCTION_ATTRIBUTE': 106,
- 'SET_UPDATE': 107,
- 'STORE_ATTR': 108,
- 'STORE_DEREF': 109,
- 'STORE_FAST': 110,
- 'STORE_FAST_LOAD_FAST': 111,
- 'STORE_FAST_STORE_FAST': 112,
- 'STORE_GLOBAL': 113,
- 'STORE_NAME': 114,
- 'SWAP': 115,
- 'UNPACK_EX': 116,
- 'UNPACK_SEQUENCE': 117,
- 'YIELD_VALUE': 118,
- 'INSTRUMENTED_RESUME': 236,
- 'INSTRUMENTED_END_FOR': 237,
- 'INSTRUMENTED_END_SEND': 238,
- 'INSTRUMENTED_RETURN_VALUE': 239,
- 'INSTRUMENTED_RETURN_CONST': 240,
- 'INSTRUMENTED_YIELD_VALUE': 241,
- 'INSTRUMENTED_LOAD_SUPER_ATTR': 242,
- 'INSTRUMENTED_FOR_ITER': 243,
- 'INSTRUMENTED_CALL': 244,
- 'INSTRUMENTED_CALL_KW': 245,
- 'INSTRUMENTED_CALL_FUNCTION_EX': 246,
- 'INSTRUMENTED_INSTRUCTION': 247,
- 'INSTRUMENTED_JUMP_FORWARD': 248,
- 'INSTRUMENTED_JUMP_BACKWARD': 249,
- 'INSTRUMENTED_POP_JUMP_IF_TRUE': 250,
- 'INSTRUMENTED_POP_JUMP_IF_FALSE': 251,
- 'INSTRUMENTED_POP_JUMP_IF_NONE': 252,
- 'INSTRUMENTED_POP_JUMP_IF_NOT_NONE': 253,
- 'JUMP': 256,
- 'JUMP_NO_INTERRUPT': 257,
- 'LOAD_CLOSURE': 258,
- 'LOAD_METHOD': 259,
- 'LOAD_SUPER_METHOD': 260,
- 'LOAD_ZERO_SUPER_ATTR': 261,
- 'LOAD_ZERO_SUPER_METHOD': 262,
- 'POP_BLOCK': 263,
- 'SETUP_CLEANUP': 264,
- 'SETUP_FINALLY': 265,
- 'SETUP_WITH': 266,
- 'STORE_FAST_MAYBE_NULL': 267,
-}
-
-HAVE_ARGUMENT = 44
-MIN_INSTRUMENTED_OPCODE = 236
diff --git a/python/python3_13/examples/_pylong.py b/python/python3_13/examples/_pylong.py
deleted file mode 100644
index 4970eb3fa6..0000000000
--- a/python/python3_13/examples/_pylong.py
+++ /dev/null
@@ -1,363 +0,0 @@
-"""Python implementations of some algorithms for use by longobject.c.
-The goal is to provide asymptotically faster algorithms that can be
-used for operations on integers with many digits. In those cases, the
-performance overhead of the Python implementation is not significant
-since the asymptotic behavior is what dominates runtime. Functions
-provided by this module should be considered private and not part of any
-public API.
-
-Note: for ease of maintainability, please prefer clear code and avoid
-"micro-optimizations". This module will only be imported and used for
-integers with a huge number of digits. Saving a few microseconds with
-tricky or non-obvious code is not worth it. For people looking for
-maximum performance, they should use something like gmpy2."""
-
-import re
-import decimal
-try:
- import _decimal
-except ImportError:
- _decimal = None
-
-# A number of functions have this form, where `w` is a desired number of
-# digits in base `base`:
-#
-# def inner(...w...):
-# if w <= LIMIT:
-# return something
-# lo = w >> 1
-# hi = w - lo
-# something involving base**lo, inner(...lo...), j, and inner(...hi...)
-# figure out largest w needed
-# result = inner(w)
-#
-# They all had some on-the-fly scheme to cache `base**lo` results for reuse.
-# Power is costly.
-#
-# This routine aims to compute all amd only the needed powers in advance, as
-# efficiently as reasonably possible. This isn't trivial, and all the
-# on-the-fly methods did needless work in many cases. The driving code above
-# changes to:
-#
-# figure out largest w needed
-# mycache = compute_powers(w, base, LIMIT)
-# result = inner(w)
-#
-# and `mycache[lo]` replaces `base**lo` in the inner function.
-#
-# While this does give minor speedups (a few percent at best), the primary
-# intent is to simplify the functions using this, by eliminating the need for
-# them to craft their own ad-hoc caching schemes.
-def compute_powers(w, base, more_than, show=False):
- seen = set()
- need = set()
- ws = {w}
- while ws:
- w = ws.pop() # any element is fine to use next
- if w in seen or w <= more_than:
- continue
- seen.add(w)
- lo = w >> 1
- # only _need_ lo here; some other path may, or may not, need hi
- need.add(lo)
- ws.add(lo)
- if w & 1:
- ws.add(lo + 1)
-
- d = {}
- if not need:
- return d
- it = iter(sorted(need))
- first = next(it)
- if show:
- print("pow at", first)
- d[first] = base ** first
- for this in it:
- if this - 1 in d:
- if show:
- print("* base at", this)
- d[this] = d[this - 1] * base # cheap
- else:
- lo = this >> 1
- hi = this - lo
- assert lo in d
- if show:
- print("square at", this)
- # Multiplying a bigint by itself (same object!) is about twice
- # as fast in CPython.
- sq = d[lo] * d[lo]
- if hi != lo:
- assert hi == lo + 1
- if show:
- print(" and * base")
- sq *= base
- d[this] = sq
- return d
-
-_unbounded_dec_context = decimal.getcontext().copy()
-_unbounded_dec_context.prec = decimal.MAX_PREC
-_unbounded_dec_context.Emax = decimal.MAX_EMAX
-_unbounded_dec_context.Emin = decimal.MIN_EMIN
-_unbounded_dec_context.traps[decimal.Inexact] = 1 # sanity check
-
-def int_to_decimal(n):
- """Asymptotically fast conversion of an 'int' to Decimal."""
-
- # Function due to Tim Peters. See GH issue #90716 for details.
- # https://github.com/python/cpython/issues/90716
- #
- # The implementation in longobject.c of base conversion algorithms
- # between power-of-2 and non-power-of-2 bases are quadratic time.
- # This function implements a divide-and-conquer algorithm that is
- # faster for large numbers. Builds an equal decimal.Decimal in a
- # "clever" recursive way. If we want a string representation, we
- # apply str to _that_.
-
- from decimal import Decimal as D
- BITLIM = 200
-
- # Don't bother caching the "lo" mask in this; the time to compute it is
- # tiny compared to the multiply.
- def inner(n, w):
- if w <= BITLIM:
- return D(n)
- w2 = w >> 1
- hi = n >> w2
- lo = n & ((1 << w2) - 1)
- return inner(lo, w2) + inner(hi, w - w2) * w2pow[w2]
-
- with decimal.localcontext(_unbounded_dec_context):
- nbits = n.bit_length()
- w2pow = compute_powers(nbits, D(2), BITLIM)
- if n < 0:
- negate = True
- n = -n
- else:
- negate = False
- result = inner(n, nbits)
- if negate:
- result = -result
- return result
-
-def int_to_decimal_string(n):
- """Asymptotically fast conversion of an 'int' to a decimal string."""
- w = n.bit_length()
- if w > 450_000 and _decimal is not None:
- # It is only usable with the C decimal implementation.
- # _pydecimal.py calls str() on very large integers, which in its
- # turn calls int_to_decimal_string(), causing very deep recursion.
- return str(int_to_decimal(n))
-
- # Fallback algorithm for the case when the C decimal module isn't
- # available. This algorithm is asymptotically worse than the algorithm
- # using the decimal module, but better than the quadratic time
- # implementation in longobject.c.
-
- DIGLIM = 1000
- def inner(n, w):
- if w <= DIGLIM:
- return str(n)
- w2 = w >> 1
- hi, lo = divmod(n, pow10[w2])
- return inner(hi, w - w2) + inner(lo, w2).zfill(w2)
-
- # The estimation of the number of decimal digits.
- # There is no harm in small error. If we guess too large, there may
- # be leading 0's that need to be stripped. If we guess too small, we
- # may need to call str() recursively for the remaining highest digits,
- # which can still potentially be a large integer. This is manifested
- # only if the number has way more than 10**15 digits, that exceeds
- # the 52-bit physical address limit in both Intel64 and AMD64.
- w = int(w * 0.3010299956639812 + 1) # log10(2)
- pow10 = compute_powers(w, 5, DIGLIM)
- for k, v in pow10.items():
- pow10[k] = v << k # 5**k << k == 5**k * 2**k == 10**k
- if n < 0:
- n = -n
- sign = '-'
- else:
- sign = ''
- s = inner(n, w)
- if s[0] == '0' and n:
- # If our guess of w is too large, there may be leading 0's that
- # need to be stripped.
- s = s.lstrip('0')
- return sign + s
-
-def _str_to_int_inner(s):
- """Asymptotically fast conversion of a 'str' to an 'int'."""
-
- # Function due to Bjorn Martinsson. See GH issue #90716 for details.
- # https://github.com/python/cpython/issues/90716
- #
- # The implementation in longobject.c of base conversion algorithms
- # between power-of-2 and non-power-of-2 bases are quadratic time.
- # This function implements a divide-and-conquer algorithm making use
- # of Python's built in big int multiplication. Since Python uses the
- # Karatsuba algorithm for multiplication, the time complexity
- # of this function is O(len(s)**1.58).
-
- DIGLIM = 2048
-
- def inner(a, b):
- if b - a <= DIGLIM:
- return int(s[a:b])
- mid = (a + b + 1) >> 1
- return (inner(mid, b)
- + ((inner(a, mid) * w5pow[b - mid])
- << (b - mid)))
-
- w5pow = compute_powers(len(s), 5, DIGLIM)
- return inner(0, len(s))
-
-
-def int_from_string(s):
- """Asymptotically fast version of PyLong_FromString(), conversion
- of a string of decimal digits into an 'int'."""
- # PyLong_FromString() has already removed leading +/-, checked for invalid
- # use of underscore characters, checked that string consists of only digits
- # and underscores, and stripped leading whitespace. The input can still
- # contain underscores and have trailing whitespace.
- s = s.rstrip().replace('_', '')
- return _str_to_int_inner(s)
-
-def str_to_int(s):
- """Asymptotically fast version of decimal string to 'int' conversion."""
- # FIXME: this doesn't support the full syntax that int() supports.
- m = re.match(r'\s*([+-]?)([0-9_]+)\s*', s)
- if not m:
- raise ValueError('invalid literal for int() with base 10')
- v = int_from_string(m.group(2))
- if m.group(1) == '-':
- v = -v
- return v
-
-
-# Fast integer division, based on code from Mark Dickinson, fast_div.py
-# GH-47701. Additional refinements and optimizations by Bjorn Martinsson. The
-# algorithm is due to Burnikel and Ziegler, in their paper "Fast Recursive
-# Division".
-
-_DIV_LIMIT = 4000
-
-
-def _div2n1n(a, b, n):
- """Divide a 2n-bit nonnegative integer a by an n-bit positive integer
- b, using a recursive divide-and-conquer algorithm.
-
- Inputs:
- n is a positive integer
- b is a positive integer with exactly n bits
- a is a nonnegative integer such that a < 2**n * b
-
- Output:
- (q, r) such that a = b*q+r and 0 <= r < b.
-
- """
- if a.bit_length() - n <= _DIV_LIMIT:
- return divmod(a, b)
- pad = n & 1
- if pad:
- a <<= 1
- b <<= 1
- n += 1
- half_n = n >> 1
- mask = (1 << half_n) - 1
- b1, b2 = b >> half_n, b & mask
- q1, r = _div3n2n(a >> n, (a >> half_n) & mask, b, b1, b2, half_n)
- q2, r = _div3n2n(r, a & mask, b, b1, b2, half_n)
- if pad:
- r >>= 1
- return q1 << half_n | q2, r
-
-
-def _div3n2n(a12, a3, b, b1, b2, n):
- """Helper function for _div2n1n; not intended to be called directly."""
- if a12 >> n == b1:
- q, r = (1 << n) - 1, a12 - (b1 << n) + b1
- else:
- q, r = _div2n1n(a12, b1, n)
- r = (r << n | a3) - q * b2
- while r < 0:
- q -= 1
- r += b
- return q, r
-
-
-def _int2digits(a, n):
- """Decompose non-negative int a into base 2**n
-
- Input:
- a is a non-negative integer
-
- Output:
- List of the digits of a in base 2**n in little-endian order,
- meaning the most significant digit is last. The most
- significant digit is guaranteed to be non-zero.
- If a is 0 then the output is an empty list.
-
- """
- a_digits = [0] * ((a.bit_length() + n - 1) // n)
-
- def inner(x, L, R):
- if L + 1 == R:
- a_digits[L] = x
- return
- mid = (L + R) >> 1
- shift = (mid - L) * n
- upper = x >> shift
- lower = x ^ (upper << shift)
- inner(lower, L, mid)
- inner(upper, mid, R)
-
- if a:
- inner(a, 0, len(a_digits))
- return a_digits
-
-
-def _digits2int(digits, n):
- """Combine base-2**n digits into an int. This function is the
- inverse of `_int2digits`. For more details, see _int2digits.
- """
-
- def inner(L, R):
- if L + 1 == R:
- return digits[L]
- mid = (L + R) >> 1
- shift = (mid - L) * n
- return (inner(mid, R) << shift) + inner(L, mid)
-
- return inner(0, len(digits)) if digits else 0
-
-
-def _divmod_pos(a, b):
- """Divide a non-negative integer a by a positive integer b, giving
- quotient and remainder."""
- # Use grade-school algorithm in base 2**n, n = nbits(b)
- n = b.bit_length()
- a_digits = _int2digits(a, n)
-
- r = 0
- q_digits = []
- for a_digit in reversed(a_digits):
- q_digit, r = _div2n1n((r << n) + a_digit, b, n)
- q_digits.append(q_digit)
- q_digits.reverse()
- q = _digits2int(q_digits, n)
- return q, r
-
-
-def int_divmod(a, b):
- """Asymptotically fast replacement for divmod, for 'int'.
- Its time complexity is O(n**1.58), where n = #bits(a) + #bits(b).
- """
- if b == 0:
- raise ZeroDivisionError
- elif b < 0:
- q, r = int_divmod(-a, -b)
- return q, -r
- elif a < 0:
- q, r = int_divmod(~a, b)
- return ~q, b + ~r
- else:
- return _divmod_pos(a, b)
diff --git a/python/python3_13/examples/_threading_local.py b/python/python3_13/examples/_threading_local.py
deleted file mode 100644
index b006d76c4e..0000000000
--- a/python/python3_13/examples/_threading_local.py
+++ /dev/null
@@ -1,242 +0,0 @@
-"""Thread-local objects.
-
-(Note that this module provides a Python version of the threading.local
- class. Depending on the version of Python you're using, there may be a
- faster one available. You should always import the `local` class from
- `threading`.)
-
-Thread-local objects support the management of thread-local data.
-If you have data that you want to be local to a thread, simply create
-a thread-local object and use its attributes:
-
- >>> mydata = local()
- >>> mydata.number = 42
- >>> mydata.number
- 42
-
-You can also access the local-object's dictionary:
-
- >>> mydata.__dict__
- {'number': 42}
- >>> mydata.__dict__.setdefault('widgets', [])
- []
- >>> mydata.widgets
- []
-
-What's important about thread-local objects is that their data are
-local to a thread. If we access the data in a different thread:
-
- >>> log = []
- >>> def f():
- ... items = sorted(mydata.__dict__.items())
- ... log.append(items)
- ... mydata.number = 11
- ... log.append(mydata.number)
-
- >>> import threading
- >>> thread = threading.Thread(target=f)
- >>> thread.start()
- >>> thread.join()
- >>> log
- [[], 11]
-
-we get different data. Furthermore, changes made in the other thread
-don't affect data seen in this thread:
-
- >>> mydata.number
- 42
-
-Of course, values you get from a local object, including a __dict__
-attribute, are for whatever thread was current at the time the
-attribute was read. For that reason, you generally don't want to save
-these values across threads, as they apply only to the thread they
-came from.
-
-You can create custom local objects by subclassing the local class:
-
- >>> class MyLocal(local):
- ... number = 2
- ... def __init__(self, /, **kw):
- ... self.__dict__.update(kw)
- ... def squared(self):
- ... return self.number ** 2
-
-This can be useful to support default values, methods and
-initialization. Note that if you define an __init__ method, it will be
-called each time the local object is used in a separate thread. This
-is necessary to initialize each thread's dictionary.
-
-Now if we create a local object:
-
- >>> mydata = MyLocal(color='red')
-
-Now we have a default number:
-
- >>> mydata.number
- 2
-
-an initial color:
-
- >>> mydata.color
- 'red'
- >>> del mydata.color
-
-And a method that operates on the data:
-
- >>> mydata.squared()
- 4
-
-As before, we can access the data in a separate thread:
-
- >>> log = []
- >>> thread = threading.Thread(target=f)
- >>> thread.start()
- >>> thread.join()
- >>> log
- [[('color', 'red')], 11]
-
-without affecting this thread's data:
-
- >>> mydata.number
- 2
- >>> mydata.color
- Traceback (most recent call last):
- ...
- AttributeError: 'MyLocal' object has no attribute 'color'
-
-Note that subclasses can define slots, but they are not thread
-local. They are shared across threads:
-
- >>> class MyLocal(local):
- ... __slots__ = 'number'
-
- >>> mydata = MyLocal()
- >>> mydata.number = 42
- >>> mydata.color = 'red'
-
-So, the separate thread:
-
- >>> thread = threading.Thread(target=f)
- >>> thread.start()
- >>> thread.join()
-
-affects what we see:
-
- >>> mydata.number
- 11
-
->>> del mydata
-"""
-
-from weakref import ref
-from contextlib import contextmanager
-
-__all__ = ["local"]
-
-# We need to use objects from the threading module, but the threading
-# module may also want to use our `local` class, if support for locals
-# isn't compiled in to the `thread` module. This creates potential problems
-# with circular imports. For that reason, we don't import `threading`
-# until the bottom of this file (a hack sufficient to worm around the
-# potential problems). Note that all platforms on CPython do have support
-# for locals in the `thread` module, and there is no circular import problem
-# then, so problems introduced by fiddling the order of imports here won't
-# manifest.
-
-class _localimpl:
- """A class managing thread-local dicts"""
- __slots__ = 'key', 'dicts', 'localargs', 'locallock', '__weakref__'
-
- def __init__(self):
- # The key used in the Thread objects' attribute dicts.
- # We keep it a string for speed but make it unlikely to clash with
- # a "real" attribute.
- self.key = '_threading_local._localimpl.' + str(id(self))
- # { id(Thread) -> (ref(Thread), thread-local dict) }
- self.dicts = {}
-
- def get_dict(self):
- """Return the dict for the current thread. Raises KeyError if none
- defined."""
- thread = current_thread()
- return self.dicts[id(thread)][1]
-
- def create_dict(self):
- """Create a new dict for the current thread, and return it."""
- localdict = {}
- key = self.key
- thread = current_thread()
- idt = id(thread)
- def local_deleted(_, key=key):
- # When the localimpl is deleted, remove the thread attribute.
- thread = wrthread()
- if thread is not None:
- del thread.__dict__[key]
- def thread_deleted(_, idt=idt):
- # When the thread is deleted, remove the local dict.
- # Note that this is suboptimal if the thread object gets
- # caught in a reference loop. We would like to be called
- # as soon as the OS-level thread ends instead.
- local = wrlocal()
- if local is not None:
- dct = local.dicts.pop(idt)
- wrlocal = ref(self, local_deleted)
- wrthread = ref(thread, thread_deleted)
- thread.__dict__[key] = wrlocal
- self.dicts[idt] = wrthread, localdict
- return localdict
-
-
-@contextmanager
-def _patch(self):
- impl = object.__getattribute__(self, '_local__impl')
- try:
- dct = impl.get_dict()
- except KeyError:
- dct = impl.create_dict()
- args, kw = impl.localargs
- self.__init__(*args, **kw)
- with impl.locallock:
- object.__setattr__(self, '__dict__', dct)
- yield
-
-
-class local:
- __slots__ = '_local__impl', '__dict__'
-
- def __new__(cls, /, *args, **kw):
- if (args or kw) and (cls.__init__ is object.__init__):
- raise TypeError("Initialization arguments are not supported")
- self = object.__new__(cls)
- impl = _localimpl()
- impl.localargs = (args, kw)
- impl.locallock = RLock()
- object.__setattr__(self, '_local__impl', impl)
- # We need to create the thread dict in anticipation of
- # __init__ being called, to make sure we don't call it
- # again ourselves.
- impl.create_dict()
- return self
-
- def __getattribute__(self, name):
- with _patch(self):
- return object.__getattribute__(self, name)
-
- def __setattr__(self, name, value):
- if name == '__dict__':
- raise AttributeError(
- "%r object attribute '__dict__' is read-only"
- % self.__class__.__name__)
- with _patch(self):
- return object.__setattr__(self, name, value)
-
- def __delattr__(self, name):
- if name == '__dict__':
- raise AttributeError(
- "%r object attribute '__dict__' is read-only"
- % self.__class__.__name__)
- with _patch(self):
- return object.__delattr__(self, name)
-
-
-from threading import current_thread, RLock
diff --git a/python/python3_13/examples/_weakrefset.py b/python/python3_13/examples/_weakrefset.py
deleted file mode 100644
index 489eec714e..0000000000
--- a/python/python3_13/examples/_weakrefset.py
+++ /dev/null
@@ -1,205 +0,0 @@
-# Access WeakSet through the weakref module.
-# This code is separated-out because it is needed
-# by abc.py to load everything else at startup.
-
-from _weakref import ref
-from types import GenericAlias
-
-__all__ = ['WeakSet']
-
-
-class _IterationGuard:
- # This context manager registers itself in the current iterators of the
- # weak container, such as to delay all removals until the context manager
- # exits.
- # This technique should be relatively thread-safe (since sets are).
-
- def __init__(self, weakcontainer):
- # Don't create cycles
- self.weakcontainer = ref(weakcontainer)
-
- def __enter__(self):
- w = self.weakcontainer()
- if w is not None:
- w._iterating.add(self)
- return self
-
- def __exit__(self, e, t, b):
- w = self.weakcontainer()
- if w is not None:
- s = w._iterating
- s.remove(self)
- if not s:
- w._commit_removals()
-
-
-class WeakSet:
- def __init__(self, data=None):
- self.data = set()
- def _remove(item, selfref=ref(self)):
- self = selfref()
- if self is not None:
- if self._iterating:
- self._pending_removals.append(item)
- else:
- self.data.discard(item)
- self._remove = _remove
- # A list of keys to be removed
- self._pending_removals = []
- self._iterating = set()
- if data is not None:
- self.update(data)
-
- def _commit_removals(self):
- pop = self._pending_removals.pop
- discard = self.data.discard
- while True:
- try:
- item = pop()
- except IndexError:
- return
- discard(item)
-
- def __iter__(self):
- with _IterationGuard(self):
- for itemref in self.data:
- item = itemref()
- if item is not None:
- # Caveat: the iterator will keep a strong reference to
- # `item` until it is resumed or closed.
- yield item
-
- def __len__(self):
- return len(self.data) - len(self._pending_removals)
-
- def __contains__(self, item):
- try:
- wr = ref(item)
- except TypeError:
- return False
- return wr in self.data
-
- def __reduce__(self):
- return self.__class__, (list(self),), self.__getstate__()
-
- def add(self, item):
- if self._pending_removals:
- self._commit_removals()
- self.data.add(ref(item, self._remove))
-
- def clear(self):
- if self._pending_removals:
- self._commit_removals()
- self.data.clear()
-
- def copy(self):
- return self.__class__(self)
-
- def pop(self):
- if self._pending_removals:
- self._commit_removals()
- while True:
- try:
- itemref = self.data.pop()
- except KeyError:
- raise KeyError('pop from empty WeakSet') from None
- item = itemref()
- if item is not None:
- return item
-
- def remove(self, item):
- if self._pending_removals:
- self._commit_removals()
- self.data.remove(ref(item))
-
- def discard(self, item):
- if self._pending_removals:
- self._commit_removals()
- self.data.discard(ref(item))
-
- def update(self, other):
- if self._pending_removals:
- self._commit_removals()
- for element in other:
- self.add(element)
-
- def __ior__(self, other):
- self.update(other)
- return self
-
- def difference(self, other):
- newset = self.copy()
- newset.difference_update(other)
- return newset
- __sub__ = difference
-
- def difference_update(self, other):
- self.__isub__(other)
- def __isub__(self, other):
- if self._pending_removals:
- self._commit_removals()
- if self is other:
- self.data.clear()
- else:
- self.data.difference_update(ref(item) for item in other)
- return self
-
- def intersection(self, other):
- return self.__class__(item for item in other if item in self)
- __and__ = intersection
-
- def intersection_update(self, other):
- self.__iand__(other)
- def __iand__(self, other):
- if self._pending_removals:
- self._commit_removals()
- self.data.intersection_update(ref(item) for item in other)
- return self
-
- def issubset(self, other):
- return self.data.issubset(ref(item) for item in other)
- __le__ = issubset
-
- def __lt__(self, other):
- return self.data < set(map(ref, other))
-
- def issuperset(self, other):
- return self.data.issuperset(ref(item) for item in other)
- __ge__ = issuperset
-
- def __gt__(self, other):
- return self.data > set(map(ref, other))
-
- def __eq__(self, other):
- if not isinstance(other, self.__class__):
- return NotImplemented
- return self.data == set(map(ref, other))
-
- def symmetric_difference(self, other):
- newset = self.copy()
- newset.symmetric_difference_update(other)
- return newset
- __xor__ = symmetric_difference
-
- def symmetric_difference_update(self, other):
- self.__ixor__(other)
- def __ixor__(self, other):
- if self._pending_removals:
- self._commit_removals()
- if self is other:
- self.data.clear()
- else:
- self.data.symmetric_difference_update(ref(item, self._remove) for item in other)
- return self
-
- def union(self, other):
- return self.__class__(e for s in (self, other) for e in s)
- __or__ = union
-
- def isdisjoint(self, other):
- return len(self.intersection(other)) == 0
-
- def __repr__(self):
- return repr(self.data)
-
- __class_getitem__ = classmethod(GenericAlias)
diff --git a/python/python3_14/CSharp/PythonLexerBase.cs b/python/python3_14/CSharp/PythonLexerBase.cs
new file mode 100644
index 0000000000..0b3ae6b28e
--- /dev/null
+++ b/python/python3_14/CSharp/PythonLexerBase.cs
@@ -0,0 +1,881 @@
+/*
+The MIT License (MIT)
+Copyright (c) 2021 Robert Einhorn
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+ */
+
+/*
+ * Project : A helper class for an ANTLR4 Python lexer grammar that assists in tokenizing indentation,
+ * interpolated strings, and encoding declaration.
+ *
+ * Developed by : Robert Einhorn
+ */
+
+using Antlr4.Runtime;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+
+[assembly: CLSCompliant(true)]
+
+public abstract class PythonLexerBase : Lexer
+{
+    private static readonly Dictionary<string, int> LEXER_MODES_FOR_ISTRING_START = [];
+
+ private const int INVALID_LENGTH = -1;
+ private const string ERR_TXT = " ERROR: ";
+ private const int TAB_LENGTH = 8;
+
+ private string encodingName = "";
+
+ // Indentation handling
+    private Stack<int> indentLengthStack = new();
+    private LinkedList<IToken> pendingTokens = new();
+
+ private int previousPendingTokenType;
+ private int lastPendingTokenTypeFromDefaultChannel;
+
+ // Parenthesis / bracket / brace counts
+ private int opened;
+    private Stack<int> paren_or_bracket_openedStack = new();
+    private Stack<string> braceExpressionStack = new();
+ private string prevBraceExpression = "";
+
+ // Current interpolated STRING_MIDDLE token type (FSTRING_MIDDLE or TSTRING_MIDDLE)
+ private int curISTRING_MIDDLEtokenType;
+
+ // We reimplement mode/stack because not all runtimes expose _mode/_modeStack
+ private int curLexerMode;
+    private Stack<int> lexerModeStack = new();
+
+ // Indentation diagnostics
+ private bool wasSpaceIndentation;
+ private bool wasTabIndentation;
+ private bool wasIndentationMixedWithSpacesAndTabs;
+
+ // Current / lookahead tokens
+ private IToken curToken = null!;
+ private IToken ffgToken = null!;
+
+ protected PythonLexerBase(ICharStream input)
+ : this(input, Console.Out, Console.Error) { }
+
+ protected PythonLexerBase(ICharStream input, TextWriter output, TextWriter errorOutput)
+ : base(input, output, errorOutput) { }
+
+ public override void Reset()
+ {
+ this.Init();
+ base.Reset();
+ }
+
+ private void Init()
+ {
+ this.encodingName = "";
+ this.indentLengthStack = new();
+ this.pendingTokens = new();
+ this.previousPendingTokenType = 0;
+ this.lastPendingTokenTypeFromDefaultChannel = 0;
+ this.opened = 0;
+ this.paren_or_bracket_openedStack = new();
+ this.braceExpressionStack = new();
+ this.prevBraceExpression = "";
+ this.curISTRING_MIDDLEtokenType = 0;
+ this.curLexerMode = Lexer.DEFAULT_MODE;
+ this.lexerModeStack = new();
+ this.wasSpaceIndentation = false;
+ this.wasTabIndentation = false;
+ this.wasIndentationMixedWithSpacesAndTabs = false;
+ this.curToken = null!;
+ this.ffgToken = null!;
+ }
+
+    /// <summary>
+    /// Sets the encoding name to emit an ENCODING token at the start of the token stream.
+    /// Leave empty if not needed (e.g., when parsing from string).
+    /// </summary>
+    /// <param name="encodingName">
+    /// The encoding name (e.g., "utf-8"), or empty string to disable ENCODING token.
+    /// </param>
+ public void SetEncodingName(string encodingName)
+ {
+ this.encodingName = encodingName;
+ }
+
+ public override IToken NextToken() // Reading the input stream until EOF is reached
+ {
+ this.CheckNextToken();
+ IToken firstPendingToken = this.pendingTokens.First!.Value;
+ this.pendingTokens.RemoveFirst();
+ return firstPendingToken; // Add the queued token to the token stream
+ }
+
+ private void CheckNextToken()
+ {
+ if (this.previousPendingTokenType == TokenConstants.EOF)
+ return;
+
+ this.SetCurrentAndFollowingTokens();
+ if (this.indentLengthStack.Count == 0) // We're at the first token
+ {
+ this.HandleStartOfInput();
+ }
+
+ switch (this.curToken.Type)
+ {
+ case PythonLexer.NEWLINE:
+ this.HandleNEWLINEtoken();
+ break;
+ case PythonLexer.LPAR:
+ case PythonLexer.LSQB:
+ case PythonLexer.LBRACE:
+ this.opened++;
+ this.AddPendingToken(this.curToken);
+ break;
+ case PythonLexer.RPAR:
+ case PythonLexer.RSQB:
+ case PythonLexer.RBRACE:
+ this.opened--;
+ this.AddPendingToken(this.curToken);
+ break;
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE:
+ this.HandleISTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field
+ this.AddPendingToken(this.curToken);
+ break;
+ case PythonLexer.COLONEQUAL:
+ this.HandleCOLONEQUALtokenInIString();
+ break;
+ case PythonLexer.ERRORTOKEN:
+ ReportLexerError($"token recognition error at: '{curToken.Text}'");
+ this.AddPendingToken(this.curToken);
+ break;
+ case TokenConstants.EOF:
+ this.HandleEOFtoken();
+ break;
+ default:
+ this.AddPendingToken(this.curToken);
+ break;
+ }
+ this.HandleFORMAT_SPECIFICATION_MODE();
+ }
+
+ private void SetCurrentAndFollowingTokens()
+ {
+ this.curToken = this.ffgToken == null ?
+ base.NextToken() :
+ this.ffgToken;
+
+ this.CheckCurToken(); // Do not use ffgToken in this method or any of its submethods — it hasn't been set yet!
+
+ this.ffgToken = this.curToken.Type == TokenConstants.EOF ?
+ this.curToken :
+ base.NextToken();
+ }
+
+ // - initialize indent stack
+ // - skip BOM token
+ // - insert ENCODING token (if any)
+ // - hide leading NEWLINE(s)
+ // - insert leading INDENT if first statement is indented
+ private void HandleStartOfInput()
+ {
+ this.indentLengthStack.Push(0); // this will never be popped off
+
+ if (this.curToken.Type == PythonLexer.BOM)
+ {
+ this.SetCurrentAndFollowingTokens();
+ }
+ this.InsertENCODINGtoken();
+
+ while (this.curToken.Type != TokenConstants.EOF)
+ {
+ if (this.curToken.Channel == TokenConstants.DefaultChannel)
+ {
+ if (this.curToken.Type == PythonLexer.NEWLINE)
+ {
+ // all the NEWLINE tokens must be ignored before the first statement
+ this.HideAndAddPendingToken(this.curToken);
+ }
+ else
+ { // We're at the first statement
+ this.InsertLeadingIndentToken();
+ return; // continue the processing of the current token with CheckNextToken()
+ }
+ }
+ else
+ {
+ this.AddPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING, or COMMENT token
+ }
+ this.SetCurrentAndFollowingTokens();
+ } // continue the processing of the EOF token with CheckNextToken()
+ }
+
+ private void InsertENCODINGtoken() // https://peps.python.org/pep-0263/
+ {
+ if (this.encodingName == "") return;
+
+        var sourcePair = new Tuple<ITokenSource, ICharStream>(this, (ICharStream)this.InputStream);
+ var encodingToken = new CommonToken(sourcePair, PythonLexer.ENCODING, TokenConstants.HiddenChannel, start: 0, stop: 0);
+ encodingToken.Text = this.encodingName;
+ encodingToken.Line = 0;
+ encodingToken.Column = -1;
+ AddPendingToken(encodingToken);
+ }
+
+ private void InsertLeadingIndentToken()
+ {
+ if (this.previousPendingTokenType == PythonLexer.WS)
+ {
+ var prevToken = this.pendingTokens.Last!.Value;
+ if (this.GetIndentationLength(prevToken.Text) != 0) // there is an "indentation" before the first statement
+ {
+ const string errMsg = "first statement indented";
+ this.ReportLexerError(errMsg);
+ // insert an INDENT token before the first statement to trigger an 'unexpected indent' error later in the parser
+ this.CreateAndAddPendingToken(PythonLexer.INDENT, PythonLexerBase.ERR_TXT + errMsg, this.curToken);
+ }
+ }
+ }
+
+ private void HandleNEWLINEtoken()
+ {
+ if (this.lexerModeStack.Count > 0) // for multi line f/t-string literals
+ {
+ this.AddPendingToken(this.curToken);
+ return;
+ }
+
+ if (this.opened > 0)
+ {
+ // We're in an implicit line joining, ignore the current NEWLINE token
+ this.HideAndAddPendingToken(this.curToken);
+ return;
+ }
+
+ var nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
+ var isLookingAhead = this.ffgToken.Type == PythonLexer.WS;
+ if (isLookingAhead)
+ {
+ this.SetCurrentAndFollowingTokens(); // set the next two tokens
+ }
+
+ switch (this.ffgToken.Type)
+ {
+ case PythonLexer.NEWLINE: // We're before a blank line
+ case PythonLexer.COMMENT: // We're before a comment
+ this.HideAndAddPendingToken(nlToken);
+ if (isLookingAhead)
+ {
+ this.AddPendingToken(this.curToken); // WS token
+ }
+ break;
+ default:
+ this.AddPendingToken(nlToken);
+ if (isLookingAhead)
+ { // We're on a whitespace(s) followed by a statement
+ var indentationLength = this.ffgToken.Type == TokenConstants.EOF ?
+ 0 :
+ this.GetIndentationLength(this.curToken.Text);
+
+ if (indentationLength != PythonLexerBase.INVALID_LENGTH)
+ {
+ this.AddPendingToken(this.curToken); // WS token
+ this.InsertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
+ }
+ else
+ {
+ this.ReportError("inconsistent use of tabs and spaces in indentation");
+ }
+ }
+ else
+ {
+ // We're at a newline followed by a statement (there is no whitespace before the statement)
+ this.InsertIndentOrDedentToken(0); // may insert DEDENT token(s)
+ }
+ break;
+ }
+ }
+
+ private void InsertIndentOrDedentToken(int indentLength)
+ {
+ var prevIndentLength = this.indentLengthStack.Peek();
+ if (indentLength > prevIndentLength)
+ {
+ this.CreateAndAddPendingToken(PythonLexer.INDENT, null, this.ffgToken);
+ this.indentLengthStack.Push(indentLength);
+ return;
+ }
+
+ while (indentLength < prevIndentLength)
+ { // more than 1 DEDENT token may be inserted into the token stream
+ this.indentLengthStack.Pop();
+ prevIndentLength = this.indentLengthStack.Peek();
+ if (indentLength <= prevIndentLength)
+ {
+ this.CreateAndAddPendingToken(PythonLexer.DEDENT, null, this.ffgToken);
+ }
+ else
+ {
+ this.ReportError("inconsistent dedent");
+ }
+ }
+ }
+
+ private void CheckCurToken()
+ {
+ switch (this.curToken.Type)
+ {
+ case PythonLexer.FSTRING_START:
+ this.curISTRING_MIDDLEtokenType = PythonLexer.FSTRING_MIDDLE;
+ this.SetLexerModeByISTRING_STARTtoken();
+ return;
+ case PythonLexer.TSTRING_START:
+ this.curISTRING_MIDDLEtokenType = PythonLexer.TSTRING_MIDDLE;
+ this.SetLexerModeByISTRING_STARTtoken();
+ return;
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE:
+                this.HandleISTRING_MIDDLEtokenWithQuoteAndLBrace(); // affects the opened field
+ switch (this.curToken.Type)
+ {
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE:
+ return; // No curToken exchange happened
+ }
+ break;
+ case PythonLexer.FSTRING_END:
+ case PythonLexer.TSTRING_END:
+ this.PopLexerMode();
+ return;
+ default:
+ if (this.lexerModeStack.Count == 0)
+ {
+ return; // Not in f/t-string mode
+ }
+ break;
+ }
+ this.ProcessBraceExpression();
+ }
+
+ private void ProcessBraceExpression()
+ {
+ switch (this.curToken.Type) // the following tokens can only come from default mode (after an LBRACE in f/t-string)
+ {
+ case PythonLexer.NEWLINE:
+ // append the current brace expression with the current newline
+ this.AppendToBraceExpression(this.curToken.Text);
+ var nlToken = new CommonToken(this.curToken);
+ nlToken.Channel = TokenConstants.HiddenChannel;
+ this.curToken = nlToken;
+ break;
+ case PythonLexer.LBRACE:
+ // the outermost brace expression cannot be a dictionary comprehension or a set comprehension
+ this.braceExpressionStack.Push("{");
+ this.paren_or_bracket_openedStack.Push(0);
+ this.PushLexerMode(Lexer.DEFAULT_MODE);
+ break;
+ case PythonLexer.LPAR:
+ case PythonLexer.LSQB:
+ // append the current brace expression with a "(" or a "["
+ this.AppendToBraceExpression(this.curToken.Text);
+ // https://peps.python.org/pep-0498/#lambdas-inside-expressions
+ this.IncrementBraceStack();
+ break;
+ case PythonLexer.RPAR:
+ case PythonLexer.RSQB:
+ // append the current brace expression with a ")" or a "]"
+ this.AppendToBraceExpression(this.curToken.Text);
+ this.DecrementBraceStack();
+ break;
+ case PythonLexer.COLON:
+ case PythonLexer.COLONEQUAL:
+ // append the current brace expression with a ":" or a ":="
+ this.AppendToBraceExpression(this.curToken.Text);
+ this.SetLexerModeByCOLONorCOLONEQUALtoken();
+ break;
+ case PythonLexer.RBRACE:
+ this.SetLexerModeAfterRBRACEtoken();
+ break;
+ default:
+ // append the current brace expression with the current token text
+ this.AppendToBraceExpression(this.curToken.Text);
+ break;
+ }
+ }
+
+ private void AppendToBraceExpression(string text)
+ {
+ var top = this.braceExpressionStack.Pop();
+ this.braceExpressionStack.Push(top + text);
+ }
+
+ private void IncrementBraceStack()
+ { // increment the last element
+ var top = this.paren_or_bracket_openedStack.Pop();
+ this.paren_or_bracket_openedStack.Push(top + 1);
+ }
+
+ private void DecrementBraceStack()
+ { // decrement the last element
+ var top = this.paren_or_bracket_openedStack.Pop();
+ this.paren_or_bracket_openedStack.Push(top - 1);
+ }
+
+ private void SetLexerModeAfterRBRACEtoken()
+ {
+ switch (this.curLexerMode)
+ {
+ case Lexer.DEFAULT_MODE:
+ this.PopLexerMode();
+ this.PopByBRACE();
+ break;
+ case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PopLexerMode();
+ this.PopLexerMode();
+ this.PopByBRACE();
+ break;
+ default:
+ this.ReportLexerError("f-string: single '}' is not allowed");
+ break;
+ }
+ }
+
+ private void SetLexerModeByISTRING_STARTtoken() // ISTRING = interpolated string (FSTRING or TSTRING)
+ {
+ if (PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.Count == 0)
+ {
+ PythonLexerBase.InitLexerModesForIStringStart();
+ }
+
+ var interpolatedStringPrefix = this.curToken.Text.ToLower();
+ if (PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.TryGetValue(interpolatedStringPrefix, out int newLexerMode))
+ {
+ this.PushLexerMode(newLexerMode);
+ }
+ else
+ {
+ this.ReportLexerError($"internal error: unknown interpolated string literal prefix: {this.curToken.Text}");
+ }
+ }
+
+ private static void InitLexerModesForIStringStart()
+ {
+ // f-strings
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["f'"] = PythonLexer.SQ1__FSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rf'"] = PythonLexer.SQ1R_FSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["fr'"] = PythonLexer.SQ1R_FSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["f\""] = PythonLexer.DQ1__FSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rf\""] = PythonLexer.DQ1R_FSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["fr\""] = PythonLexer.DQ1R_FSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["f'''"] = PythonLexer.SQ3__FSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rf'''"] = PythonLexer.SQ3R_FSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["fr'''"] = PythonLexer.SQ3R_FSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["f\"\"\""] = PythonLexer.DQ3__FSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rf\"\"\""] = PythonLexer.DQ3R_FSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["fr\"\"\""] = PythonLexer.DQ3R_FSTRING_MODE;
+
+ // t-strings
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["t'"] = PythonLexer.SQ1__TSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rt'"] = PythonLexer.SQ1R_TSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["tr'"] = PythonLexer.SQ1R_TSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["t\""] = PythonLexer.DQ1__TSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rt\""] = PythonLexer.DQ1R_TSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["tr\""] = PythonLexer.DQ1R_TSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["t'''"] = PythonLexer.SQ3__TSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rt'''"] = PythonLexer.SQ3R_TSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["tr'''"] = PythonLexer.SQ3R_TSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["t\"\"\""] = PythonLexer.DQ3__TSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["rt\"\"\""] = PythonLexer.DQ3R_TSTRING_MODE;
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START["tr\"\"\""] = PythonLexer.DQ3R_TSTRING_MODE;
+ }
+
+ private void SetLexerModeByCOLONorCOLONEQUALtoken()
+ {
+ // Exit early when the current lexer mode indicates an open parenthesis/bracket
+ if (this.paren_or_bracket_openedStack.Peek() != 0)
+ {
+ return;
+ }
+
+ // COLONEQUAL token will be replaced with a COLON token in CheckNextToken()
+ var prevLexerMode = this.lexerModeStack.Peek();
+ switch (prevLexerMode) // check the previous lexer mode (the current is DEFAULT_MODE)
+ {
+ case PythonLexer.SQ1__FSTRING_MODE:
+ case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.SQ1__TSTRING_MODE:
+ case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.SQ1R_FSTRING_MODE:
+ case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.SQ1R_TSTRING_MODE:
+ case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.DQ1__FSTRING_MODE:
+ case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.DQ1__TSTRING_MODE:
+ case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.DQ1R_FSTRING_MODE:
+ case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.DQ1R_TSTRING_MODE:
+ case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.SQ3__FSTRING_MODE:
+ case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.SQ3__TSTRING_MODE:
+ case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.SQ3R_FSTRING_MODE:
+ case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.SQ3R_TSTRING_MODE:
+ case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.DQ3__FSTRING_MODE:
+ case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.DQ3__TSTRING_MODE:
+ case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.DQ3R_FSTRING_MODE:
+ case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+
+ case PythonLexer.DQ3R_TSTRING_MODE:
+ case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.PushLexerMode(PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+ }
+ }
+
+ private void PopByBRACE()
+ {
+ this.paren_or_bracket_openedStack.Pop();
+ var curBraceExpression = this.braceExpressionStack.Pop();
+ this.prevBraceExpression = curBraceExpression + "}";
+ if (this.braceExpressionStack.Count > 0)
+ {
+ // Extend the current brace expression by adding the previous expression
+ curBraceExpression = this.braceExpressionStack.Pop();
+ this.braceExpressionStack.Push(curBraceExpression + this.prevBraceExpression);
+ }
+ }
+
+ private void HandleISTRING_MIDDLEtokenWithDoubleBrace() // ISTRING = interpolated string (FSTRING or TSTRING)
+ {
+ // replace the trailing double brace with a single brace and insert a hidden brace token
+ var lastTwoChars = this.GetLastTwoCharsOfTheCurTokenText();
+ switch (lastTwoChars)
+ {
+ case "{{":
+ this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", TokenConstants.HiddenChannel);
+ break;
+ case "}}":
+ this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", TokenConstants.HiddenChannel);
+ break;
+ }
+ }
+
+ private void HandleISTRING_MIDDLEtokenWithQuoteAndLBrace() // ISTRING = interpolated string (FSTRING or TSTRING)
+ {
+ // replace the trailing quote + left_brace with a quote and insert an LBRACE token
+ // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token
+ var lastTwoChars = this.GetLastTwoCharsOfTheCurTokenText();
+ switch (lastTwoChars)
+ {
+ case "\"{":
+ case "'{":
+ case "\\{":
+ this.TrimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", TokenConstants.DefaultChannel);
+ break;
+ }
+ }
+
+ private string GetLastTwoCharsOfTheCurTokenText()
+ {
+ var text = this.curToken.Text;
+ return text.Length >= 2 ? text[^2..] : text;
+
+ }
+
+ private void TrimLastCharAddPendingTokenSetCurToken(int type, string text, int channel)
+ {
+ // trim the last char and add the modified curToken to the pendingTokens stack
+ var curTokenText = this.curToken.Text;
+ var tokenTextWithoutLastChar = curTokenText[..^1];
+ var token = new CommonToken(this.curToken);
+ token.Text = tokenTextWithoutLastChar;
+ token.StopIndex -= 1;
+ this.AddPendingToken(token);
+
+ this.CreateNewCurToken(type, text, channel); // set curToken
+ }
+
+ private void HandleCOLONEQUALtokenInIString() // ISTRING = interpolated string (FSTRING or TSTRING)
+ {
+ if (this.lexerModeStack.Count > 0 &&
+ this.paren_or_bracket_openedStack.Peek() == 0)
+ {
+ // In an f/t-string, the walrus operator (:=) is only allowed inside parentheses.
+ // If used outside, split the COLONEQUAL token into a COLON
+ // (used as a format specifier instead of a walrus operator),
+ // and move the equal sign to the beginning of the next token (FSTRING_MIDDLE or TSTRING_MIDDLE).
+ var colonequalToken = new CommonToken(this.curToken);
+ colonequalToken.Type = PythonLexer.COLON;
+ colonequalToken.Text = ":";
+ colonequalToken.StopIndex = colonequalToken.StartIndex;
+ this.curToken = colonequalToken;
+
+ switch (this.ffgToken.Type)
+ {
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE:
+ colonequalToken = new CommonToken(this.ffgToken);
+ colonequalToken.Text = "=" + colonequalToken.Text;
+ colonequalToken.StartIndex -= 1;
+ colonequalToken.Column -= 1;
+ this.ffgToken = colonequalToken;
+ break;
+ default:
+ this.AddPendingToken(this.curToken);
+ this.CreateNewCurToken(this.curISTRING_MIDDLEtokenType, "=", TokenConstants.DefaultChannel);
+ break;
+ }
+ }
+ this.AddPendingToken(this.curToken);
+ }
+
+ private void CreateNewCurToken(int type, string text, int channel)
+ {
+ var token = new CommonToken(this.curToken);
+ token.Type = type;
+ token.Text = text;
+ token.Channel = channel;
+ token.Column += 1;
+ token.StartIndex += 1;
+ token.StopIndex = token.StartIndex;
+ this.curToken = token;
+ }
+
+ private void PushLexerMode(int mode)
+ {
+ this.PushMode(mode);
+ this.lexerModeStack.Push(this.curLexerMode);
+ this.curLexerMode = mode;
+ }
+
+ private void PopLexerMode()
+ {
+ this.PopMode();
+ this.curLexerMode = this.lexerModeStack.Pop();
+ }
+
+ private void HandleFORMAT_SPECIFICATION_MODE()
+ {
+ if (this.lexerModeStack.Count == 0 || this.ffgToken.Type != PythonLexer.RBRACE)
+ {
+ return;
+ }
+
+ // insert an empty FSTRING_MIDDLE or TSTRING_MIDDLE token instead of the missing format specification
+ switch (this.curToken.Type)
+ {
+ case PythonLexer.COLON:
+ this.CreateAndAddPendingToken(this.curISTRING_MIDDLEtokenType, "", this.ffgToken);
+ break;
+ case PythonLexer.RBRACE:
+ // only when the previous brace expression is not a dictionary comprehension or set comprehension
+ if (!IsValid_DictionaryOrSet_ComprehensionExpression(this.prevBraceExpression))
+ {
+ this.CreateAndAddPendingToken(this.curISTRING_MIDDLEtokenType, "", this.ffgToken);
+ }
+ break;
+ }
+ }
+
+ private static bool IsValid_DictionaryOrSet_ComprehensionExpression(string code)
+ {
+ var inputStream = CharStreams.fromString(code);
+ var lexer = new PythonLexer(inputStream);
+ var tokenStream = new CommonTokenStream(lexer);
+ var parser = new PythonParser(tokenStream);
+
+ // Disable error listeners to suppress console output
+ lexer.RemoveErrorListeners();
+ parser.RemoveErrorListeners();
+
+ parser.dictcomp(); // Try parsing as dictionary comprehension
+ if (parser.NumberOfSyntaxErrors == 0)
+ return true;
+
+ parser = new PythonParser(tokenStream);
+ tokenStream.Seek(0);
+ parser.RemoveErrorListeners();
+ parser.setcomp(); // Try parsing as set comprehension
+ return parser.NumberOfSyntaxErrors == 0;
+ }
+
+ private void InsertTrailingTokens()
+ {
+ switch (this.lastPendingTokenTypeFromDefaultChannel)
+ {
+ case PythonLexer.NEWLINE:
+ case PythonLexer.DEDENT:
+ break; // no trailing NEWLINE token is needed
+ default:
+ // insert an extra trailing NEWLINE token that serves as the end of the last statement
+ this.CreateAndAddPendingToken(PythonLexer.NEWLINE, null, this.ffgToken); // ffgToken is EOF
+ break;
+ }
+ this.InsertIndentOrDedentToken(0); // Now insert as many trailing DEDENT tokens as needed
+ }
+
+ private void HandleEOFtoken()
+ {
+ if (this.lastPendingTokenTypeFromDefaultChannel > 0)
+ { // there was a statement in the intStream (leading NEWLINE tokens are hidden)
+ this.InsertTrailingTokens();
+ }
+ this.AddPendingToken(this.curToken);
+ }
+
+ private void HideAndAddPendingToken(IToken originalToken)
+ {
+ var token = new CommonToken(originalToken);
+ token.Channel = TokenConstants.HiddenChannel;
+ this.AddPendingToken(token);
+ }
+
+ private void CreateAndAddPendingToken(int ttype, string? text, IToken originalToken)
+ {
+ var token = new CommonToken(originalToken);
+ token.Type = ttype;
+ token.Channel = TokenConstants.DefaultChannel;
+ token.StopIndex = originalToken.StartIndex - 1;
+ token.Text = text ?? "<" + this.Vocabulary.GetSymbolicName(ttype) + ">";
+
+ this.AddPendingToken(token);
+ }
+
+ private void AddPendingToken(IToken token)
+ {
+ // save the last pending token type because the pendingTokens list can be empty by the nextToken()
+ this.previousPendingTokenType = token.Type;
+ if (token.Channel == TokenConstants.DefaultChannel)
+ {
+ this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
+ }
+ this.pendingTokens.AddLast(token);
+ }
+
+ private int GetIndentationLength(string indentText) // the indentText may contain spaces, tabs or form feeds
+ {
+ var length = 0;
+ foreach (char ch in indentText)
+ {
+ switch (ch)
+ {
+ case ' ':
+ this.wasSpaceIndentation = true;
+ length += 1;
+ break;
+ case '\t':
+ this.wasTabIndentation = true;
+ length += PythonLexerBase.TAB_LENGTH - (length % PythonLexerBase.TAB_LENGTH);
+ break;
+ case '\f': // form feed
+ length = 0;
+ break;
+ }
+ }
+
+ if (this.wasTabIndentation && this.wasSpaceIndentation)
+ {
+ if (!this.wasIndentationMixedWithSpacesAndTabs)
+ {
+ this.wasIndentationMixedWithSpacesAndTabs = true;
+ length = PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent
+ }
+ }
+ return length;
+ }
+
+ private void ReportLexerError(string errMsg)
+ {
+ this.ErrorListenerDispatch.SyntaxError(this.ErrorOutput, this, this.curToken.Type, this.curToken.Line, this.curToken.Column, " LEXER" + PythonLexerBase.ERR_TXT + errMsg, null);
+ }
+
+ private void ReportError(string errMsg)
+ {
+ this.ReportLexerError(errMsg);
+ this.CreateAndAddPendingToken(PythonLexer.ERRORTOKEN, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken);
+ // the ERRORTOKEN also triggers a parser error
+ }
+}
diff --git a/python/python3_14/Java/PythonLexerBase.java b/python/python3_14/Java/PythonLexerBase.java
new file mode 100644
index 0000000000..ef8ccf7c12
--- /dev/null
+++ b/python/python3_14/Java/PythonLexerBase.java
@@ -0,0 +1,768 @@
+/*
+The MIT License (MIT)
+Copyright (c) 2021 Robert Einhorn
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+ */
+
+/*
+ *
+ * Project : A helper class for an ANTLR4 Python lexer grammar that assists in tokenizing indentation,
+ * interpolated strings, and encoding declaration.
+ *
+ * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com
+ *
+ */
+
+// **** Implemented in Java 8 for compatibility with ANTLR4 Java runtime ****
+
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.antlr.v4.runtime.*;
+import org.antlr.v4.runtime.misc.Pair;
+
+// Lexer helper for the Python 3.14 grammar: synthesizes INDENT/DEDENT tokens,
+// tracks f-string / t-string (interpolated string) lexer modes, and optionally
+// emits a hidden ENCODING token (PEP 263).
+// NOTE(review): the generic type parameters below (Map<String, Integer>,
+// Deque<Integer>, Deque<Token>, Deque<String>, Pair<TokenSource, CharStream>)
+// were stripped by text extraction in the raw patch; they are restored here —
+// the raw types would not compile (Object cannot unbox to int).
+public abstract class PythonLexerBase extends Lexer {
+    private static final Map<String, Integer> LEXER_MODES_FOR_ISTRING_START = new HashMap<>();
+
+    private static final int INVALID_LENGTH = -1;
+    private static final String ERR_TXT = " ERROR: ";
+    private static final int TAB_LENGTH = 8;
+
+    private String encodingName;
+
+    // Indentation handling
+    private Deque<Integer> indentLengthStack;
+    private Deque<Token> pendingTokens;
+
+    private int previousPendingTokenType;
+    private int lastPendingTokenTypeFromDefaultChannel;
+
+    // Parenthesis / bracket / brace counts
+    private int opened;
+    private Deque<Integer> parenOrBracketOpenedStack;
+    private Deque<String> braceExpressionStack;
+    private String prevBraceExpression;
+
+    // Current interpolated STRING_MIDDLE token type (FSTRING_MIDDLE or TSTRING_MIDDLE)
+    private int curISTRING_MIDDLEtokenType;
+
+    // We reimplement mode/stack because not all runtimes expose _mode/_modeStack
+    private int curLexerMode;
+    private Deque<Integer> lexerModeStack;
+
+    // Indentation diagnostics
+    private boolean wasSpaceIndentation;
+    private boolean wasTabIndentation;
+    private boolean wasIndentationMixedWithSpacesAndTabs;
+
+    // Current / lookahead tokens
+    private Token curToken;
+    private Token ffgToken;
+
+    protected PythonLexerBase(CharStream input) {
+        super(input);
+        this.init();
+    }
+
+    @Override
+    public void reset() {
+        this.init();
+        super.reset();
+    }
+
+    // (Re)initializes all helper state; called from the constructor and reset().
+    private void init() {
+        this.encodingName = "";
+        this.indentLengthStack = new ArrayDeque<>();
+        this.pendingTokens = new ArrayDeque<>();
+        this.previousPendingTokenType = 0;
+        this.lastPendingTokenTypeFromDefaultChannel = 0;
+        this.opened = 0;
+        this.parenOrBracketOpenedStack = new ArrayDeque<>();
+        this.braceExpressionStack = new ArrayDeque<>();
+        this.prevBraceExpression = "";
+        this.curISTRING_MIDDLEtokenType = 0;
+        this.curLexerMode = Lexer.DEFAULT_MODE;
+        this.lexerModeStack = new ArrayDeque<>();
+        this.wasSpaceIndentation = false;
+        this.wasTabIndentation = false;
+        this.wasIndentationMixedWithSpacesAndTabs = false;
+        this.curToken = null;
+        this.ffgToken = null;
+    }
+
+    /**
+     * Sets the encoding name to emit an ENCODING token at the start of the token stream.
+     * Leave empty if not needed (e.g., when parsing from string).
+     *
+     * @param encodingName the encoding name (e.g., "utf-8"), or empty string to disable ENCODING token
+     */
+    public void setEncodingName(final String encodingName) {
+        this.encodingName = encodingName;
+    }
+
+    @Override
+    public Token nextToken() { // Reading the input stream until EOF is reached
+        this.checkNextToken();
+        return this.pendingTokens.pollFirst(); // Add the queued token to the token stream
+    }
+
+    // Main dispatcher: pulls the next raw token and routes it to the handler
+    // that may queue extra tokens (INDENT/DEDENT, hidden NEWLINEs, ...).
+    private void checkNextToken() {
+        if (this.previousPendingTokenType == Token.EOF) return;
+
+        this.setCurrentAndFollowingTokens();
+        if (this.indentLengthStack.isEmpty()) { // We're at the first token
+            this.handleStartOfInput();
+        }
+
+        switch (this.curToken.getType()) {
+            case PythonLexer.NEWLINE:
+                this.handleNEWLINEtoken();
+                break;
+            case PythonLexer.LPAR:
+            case PythonLexer.LSQB:
+            case PythonLexer.LBRACE:
+                this.opened++;
+                this.addPendingToken(this.curToken);
+                break;
+            case PythonLexer.RPAR:
+            case PythonLexer.RSQB:
+            case PythonLexer.RBRACE:
+                this.opened--;
+                this.addPendingToken(this.curToken);
+                break;
+            case PythonLexer.FSTRING_MIDDLE:
+            case PythonLexer.TSTRING_MIDDLE:
+                this.handleISTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field
+                this.addPendingToken(this.curToken);
+                break;
+            case PythonLexer.COLONEQUAL:
+                this.handleCOLONEQUALtokenInIString();
+                break;
+            case PythonLexer.ERRORTOKEN:
+                this.reportLexerError("token recognition error at: '" + this.curToken.getText() + "'");
+                this.addPendingToken(this.curToken);
+                break;
+            case Token.EOF:
+                this.handleEOFtoken();
+                break;
+            default:
+                this.addPendingToken(this.curToken);
+        }
+        this.handleFORMAT_SPECIFICATION_MODE();
+    }
+
+    private void setCurrentAndFollowingTokens() {
+        this.curToken = this.ffgToken == null ?
+                        super.nextToken() :
+                        this.ffgToken;
+
+        this.checkCurToken(); // Do not use ffgToken in this method or any of its submethods — it hasn't been set yet!
+
+        this.ffgToken = this.curToken.getType() == Token.EOF ?
+                        this.curToken :
+                        super.nextToken();
+    }
+
+    // - initialize indent stack
+    // - skip BOM token
+    // - insert ENCODING token (if any)
+    // - hide leading NEWLINE(s)
+    // - insert leading INDENT if first statement is indented
+    private void handleStartOfInput() {
+        this.indentLengthStack.push(0); // this will never be popped off
+
+        if (this.curToken.getType() == PythonLexer.BOM) {
+            this.setCurrentAndFollowingTokens();
+        }
+        this.insertENCODINGtoken();
+
+        while (this.curToken.getType() != Token.EOF) {
+            if (this.curToken.getChannel() == Token.DEFAULT_CHANNEL) {
+                if (this.curToken.getType() == PythonLexer.NEWLINE) {
+                    // all the NEWLINE tokens must be ignored before the first statement
+                    this.hideAndAddPendingToken(this.curToken);
+                } else { // We're at the first statement
+                    this.insertLeadingIndentToken();
+                    return; // continue the processing of the current token with checkNextToken()
+                }
+            } else {
+                this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
+            }
+            this.setCurrentAndFollowingTokens();
+        }
+        // continue the processing of the EOF token with checkNextToken()
+    }
+
+    private void insertENCODINGtoken() { // https://peps.python.org/pep-0263/
+        if (this.encodingName.isEmpty()) return;
+
+        final Pair<TokenSource, CharStream> sourcePair = this._tokenFactorySourcePair;
+        final CommonToken encodingToken = new CommonToken(sourcePair, PythonLexer.ENCODING, Token.HIDDEN_CHANNEL, 0, 0);
+        encodingToken.setText(this.encodingName);
+        encodingToken.setLine(0);
+        encodingToken.setCharPositionInLine(-1);
+        this.addPendingToken(encodingToken);
+    }
+
+    private void insertLeadingIndentToken() {
+        if (this.previousPendingTokenType == PythonLexer.WS) {
+            Token prevToken = this.pendingTokens.peekLast(); // WS token
+            if (this.getIndentationLength(prevToken.getText()) != 0) { // there is an "indentation" before the first statement
+                final String errMsg = "first statement indented";
+                this.reportLexerError(errMsg);
+                // insert an INDENT token before the first statement to trigger an 'unexpected indent' error later in the parser
+                this.createAndAddPendingToken(PythonLexer.INDENT, ERR_TXT + errMsg, this.curToken);
+            }
+        }
+    }
+
+    private void handleNEWLINEtoken() {
+        if (!this.lexerModeStack.isEmpty()) { // for multi line f/t-string literals
+            this.addPendingToken(this.curToken);
+            return;
+        }
+
+        if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
+            this.hideAndAddPendingToken(this.curToken);
+            return;
+        }
+
+        final Token nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
+        final boolean isLookingAhead = this.ffgToken.getType() == PythonLexer.WS;
+        if (isLookingAhead) {
+            this.setCurrentAndFollowingTokens(); // set the next two tokens
+        }
+
+        switch (this.ffgToken.getType()) {
+            case PythonLexer.NEWLINE: // We're before a blank line
+            case PythonLexer.COMMENT: // We're before a comment
+                this.hideAndAddPendingToken(nlToken);
+                if (isLookingAhead) {
+                    this.addPendingToken(this.curToken); // WS token
+                }
+                break;
+            default:
+                this.addPendingToken(nlToken);
+                if (isLookingAhead) { // We're on a whitespace(s) followed by a statement
+                    final int indentationLength = this.ffgToken.getType() == Token.EOF ?
+                                                  0 :
+                                                  this.getIndentationLength(this.curToken.getText());
+
+                    if (indentationLength != PythonLexerBase.INVALID_LENGTH) {
+                        this.addPendingToken(this.curToken); // WS token
+                        this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
+                    } else {
+                        this.reportError("inconsistent use of tabs and spaces in indentation");
+                    }
+                } else { // We're at a newline followed by a statement (there is no whitespace before the statement)
+                    this.insertIndentOrDedentToken(0); // may insert DEDENT token(s)
+                }
+        }
+    }
+
+    private void insertIndentOrDedentToken(final int indentLength) {
+        int prevIndentLength = this.indentLengthStack.peek();
+        if (indentLength > prevIndentLength) {
+            this.createAndAddPendingToken(PythonLexer.INDENT, null, this.ffgToken);
+            this.indentLengthStack.push(indentLength);
+            return;
+        }
+
+        while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
+            this.indentLengthStack.pop();
+            prevIndentLength = this.indentLengthStack.peek();
+            if (indentLength <= prevIndentLength) {
+                this.createAndAddPendingToken(PythonLexer.DEDENT, null, this.ffgToken);
+            } else {
+                this.reportError("inconsistent dedent");
+            }
+        }
+    }
+
+    // Adjusts lexer-mode bookkeeping for the freshly fetched curToken
+    // (f/t-string start/middle/end); may exchange curToken itself.
+    private void checkCurToken() {
+        switch (this.curToken.getType()) {
+            case PythonLexer.FSTRING_START:
+                this.curISTRING_MIDDLEtokenType = PythonLexer.FSTRING_MIDDLE;
+                this.setLexerModeByISTRING_STARTtoken();
+                return;
+            case PythonLexer.TSTRING_START:
+                this.curISTRING_MIDDLEtokenType = PythonLexer.TSTRING_MIDDLE;
+                this.setLexerModeByISTRING_STARTtoken();
+                return;
+            case PythonLexer.FSTRING_MIDDLE:
+            case PythonLexer.TSTRING_MIDDLE:
+                this.handleISTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field
+                switch (this.curToken.getType()) {
+                    case PythonLexer.FSTRING_MIDDLE:
+                    case PythonLexer.TSTRING_MIDDLE:
+                        return; // No curToken exchange happened
+                }
+                break;
+            case PythonLexer.FSTRING_END:
+            case PythonLexer.TSTRING_END:
+                this.popLexerMode();
+                return;
+            default:
+                if (this.lexerModeStack.isEmpty()) {
+                    return; // Not in f/t-string mode
+                }
+        }
+        this.processBraceExpression();
+    }
+
+    private void processBraceExpression() {
+        switch (this.curToken.getType()) { // the following tokens can only come from default mode (after an LBRACE in f/t-string)
+            case PythonLexer.NEWLINE:
+                // append the current brace expression with the current newline
+                this.appendToBraceExpression(this.curToken.getText());
+                final CommonToken nlToken = new CommonToken(this.curToken);
+                nlToken.setChannel(Token.HIDDEN_CHANNEL);
+                this.curToken = nlToken;
+                break;
+            case PythonLexer.LBRACE:
+                // the outermost brace expression cannot be a dictionary comprehension or a set comprehension
+                this.braceExpressionStack.push("{");
+                this.parenOrBracketOpenedStack.push(0);
+                this.pushLexerMode(Lexer.DEFAULT_MODE);
+                break;
+            case PythonLexer.LPAR:
+            case PythonLexer.LSQB:
+                // append the current brace expression with a "(" or a "["
+                this.appendToBraceExpression(this.curToken.getText());
+                // https://peps.python.org/pep-0498/#lambdas-inside-expressions
+                this.incrementBraceStack();
+                break;
+            case PythonLexer.RPAR:
+            case PythonLexer.RSQB:
+                // append the current brace expression with a ")" or a "]"
+                this.appendToBraceExpression(this.curToken.getText());
+                this.decrementBraceStack();
+                break;
+            case PythonLexer.COLON:
+            case PythonLexer.COLONEQUAL:
+                // append the current brace expression with a ":" or a ":="
+                this.appendToBraceExpression(this.curToken.getText());
+                this.setLexerModeByCOLONorCOLONEQUALtoken();
+                break;
+            case PythonLexer.RBRACE:
+                this.setLexerModeAfterRBRACEtoken();
+                break;
+            default:
+                // append the current brace expression with the current token text
+                this.appendToBraceExpression(this.curToken.getText());
+        }
+    }
+
+    private void appendToBraceExpression(final String text) {
+        final String top = this.braceExpressionStack.pop();
+        this.braceExpressionStack.push(top + text);
+    }
+
+    private void incrementBraceStack() { // increment the last element
+        this.parenOrBracketOpenedStack.push(this.parenOrBracketOpenedStack.pop() + 1);
+    }
+
+    private void decrementBraceStack() { // decrement the last element
+        this.parenOrBracketOpenedStack.push(this.parenOrBracketOpenedStack.pop() - 1);
+    }
+
+    private void setLexerModeAfterRBRACEtoken() {
+        switch (this.curLexerMode) {
+            case Lexer.DEFAULT_MODE:
+                this.popLexerMode();
+                this.popByBRACE();
+                break;
+            case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+            case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+                this.popLexerMode();
+                this.popLexerMode();
+                this.popByBRACE();
+                break;
+            default:
+                this.reportLexerError("f-string: single '}' is not allowed");
+        }
+    }
+
+    private void setLexerModeByISTRING_STARTtoken() { // ISTRING = interpolated string (FSTRING or TSTRING)
+        if (PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.isEmpty()) {
+            PythonLexerBase.initLexerModesForIStringStart();
+        }
+
+        final String interpolatedStringPrefix = this.curToken.getText().toLowerCase();
+        final Integer newLexerMode = PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.get(interpolatedStringPrefix);
+        if (newLexerMode != null) {
+            this.pushLexerMode(newLexerMode);
+        } else {
+            this.reportLexerError("internal error: unknown interpolated string literal prefix: " + this.curToken.getText());
+        }
+    }
+
+    private static void initLexerModesForIStringStart() {
+        // f-strings
+        LEXER_MODES_FOR_ISTRING_START.put("f'", PythonLexer.SQ1__FSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("rf'", PythonLexer.SQ1R_FSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("fr'", PythonLexer.SQ1R_FSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("f\"", PythonLexer.DQ1__FSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("rf\"", PythonLexer.DQ1R_FSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("fr\"", PythonLexer.DQ1R_FSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("f'''", PythonLexer.SQ3__FSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("rf'''", PythonLexer.SQ3R_FSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("fr'''", PythonLexer.SQ3R_FSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("f\"\"\"", PythonLexer.DQ3__FSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("rf\"\"\"", PythonLexer.DQ3R_FSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("fr\"\"\"", PythonLexer.DQ3R_FSTRING_MODE);
+
+        // t-strings
+        LEXER_MODES_FOR_ISTRING_START.put("t'", PythonLexer.SQ1__TSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("rt'", PythonLexer.SQ1R_TSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("tr'", PythonLexer.SQ1R_TSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("t\"", PythonLexer.DQ1__TSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("rt\"", PythonLexer.DQ1R_TSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("tr\"", PythonLexer.DQ1R_TSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("t'''", PythonLexer.SQ3__TSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("rt'''", PythonLexer.SQ3R_TSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("tr'''", PythonLexer.SQ3R_TSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("t\"\"\"", PythonLexer.DQ3__TSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("rt\"\"\"", PythonLexer.DQ3R_TSTRING_MODE);
+        LEXER_MODES_FOR_ISTRING_START.put("tr\"\"\"", PythonLexer.DQ3R_TSTRING_MODE);
+    }
+
+    private void setLexerModeByCOLONorCOLONEQUALtoken() {
+        // Exit early when the current lexer mode indicates an open parenthesis/bracket
+        if (this.parenOrBracketOpenedStack.peek() != 0) {
+            return;
+        }
+
+        // COLONEQUAL token will be replaced with a COLON token in checkNextToken()
+        final int prevLexerMode = lexerModeStack.peek();
+        switch (prevLexerMode) { // check the previous lexer mode (the current is DEFAULT_MODE)
+            case PythonLexer.SQ1__FSTRING_MODE:
+            case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.SQ1__TSTRING_MODE:
+            case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.SQ1R_FSTRING_MODE:
+            case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.SQ1R_TSTRING_MODE:
+            case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.DQ1__FSTRING_MODE:
+            case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.DQ1__TSTRING_MODE:
+            case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.DQ1R_FSTRING_MODE:
+            case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.DQ1R_TSTRING_MODE:
+            case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.SQ3__FSTRING_MODE:
+            case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.SQ3__TSTRING_MODE:
+            case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.SQ3R_FSTRING_MODE:
+            case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.SQ3R_TSTRING_MODE:
+            case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.DQ3__FSTRING_MODE:
+            case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.DQ3__TSTRING_MODE:
+            case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.DQ3R_FSTRING_MODE:
+            case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+            case PythonLexer.DQ3R_TSTRING_MODE:
+            case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+                this.pushLexerMode(PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+                break;
+        }
+    }
+
+    // Closes the current brace expression; remembers it in prevBraceExpression
+    // and, for nested braces, folds it into the enclosing brace expression.
+    private void popByBRACE() {
+        this.parenOrBracketOpenedStack.pop();
+        String curBraceExpression = this.braceExpressionStack.pop();
+        this.prevBraceExpression = curBraceExpression + "}";
+        if (!this.braceExpressionStack.isEmpty()) {
+            // Extend the current brace expression by adding the previous expression
+            curBraceExpression = this.braceExpressionStack.pop();
+            this.braceExpressionStack.push(curBraceExpression + this.prevBraceExpression);
+        }
+    }
+
+    private void handleISTRING_MIDDLEtokenWithDoubleBrace() { // ISTRING = interpolated string (FSTRING or TSTRING)
+        // replace the trailing double brace with a single brace and insert a hidden brace token
+        final String lastTwoChars = this.getLastTwoCharsOfTheCurTokenText();
+        switch (lastTwoChars) {
+            case "{{":
+                this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL);
+                break;
+            case "}}":
+                this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL);
+                break;
+        }
+    }
+
+    private void handleISTRING_MIDDLEtokenWithQuoteAndLBrace() { // ISTRING = interpolated string (FSTRING or TSTRING)
+        // replace the trailing quote + left_brace with a quote and insert an LBRACE token
+        // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token
+        final String lastTwoChars = this.getLastTwoCharsOfTheCurTokenText();
+        switch (lastTwoChars) {
+            case "\"{":
+            case "'{":
+            case "\\{":
+                this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL);
+                break;
+        }
+    }
+
+    private String getLastTwoCharsOfTheCurTokenText() {
+        final String text = this.curToken.getText();
+        return text.length() >= 2 ? text.substring(text.length() - 2) : text;
+    }
+
+    private void trimLastCharAddPendingTokenSetCurToken(final int type, final String text, final int channel) {
+        // trim the last char and add the modified curToken to the pendingTokens stack
+        final String curTokenText = this.curToken.getText();
+        final String tokenTextWithoutLastChar = curTokenText.substring(0, curTokenText.length() - 1);
+        final CommonToken token = new CommonToken(this.curToken);
+        token.setText(tokenTextWithoutLastChar);
+        token.setStopIndex(token.getStopIndex() - 1);
+        this.addPendingToken(token);
+
+        this.createNewCurToken(type, text, channel); // set curToken
+    }
+
+    private void handleCOLONEQUALtokenInIString() { // ISTRING = interpolated string (FSTRING or TSTRING)
+        if (!this.lexerModeStack.isEmpty() &&
+            this.parenOrBracketOpenedStack.peek() == 0) {
+
+            // In an f/t-string, the walrus operator (:=) is only allowed inside parentheses.
+            // If used outside, split the COLONEQUAL token into a COLON
+            // (used as a format specifier instead of a walrus operator),
+            // and move the equal sign to the beginning of the next token (FSTRING_MIDDLE or TSTRING_MIDDLE).
+            CommonToken colonequalToken = new CommonToken(this.curToken);
+            colonequalToken.setType(PythonLexer.COLON);
+            colonequalToken.setText(":");
+            colonequalToken.setStopIndex(colonequalToken.getStartIndex());
+            this.curToken = colonequalToken;
+
+            switch (this.ffgToken.getType()) {
+                case PythonLexer.FSTRING_MIDDLE:
+                case PythonLexer.TSTRING_MIDDLE:
+                    colonequalToken = new CommonToken(this.ffgToken);
+                    colonequalToken.setText("=" + colonequalToken.getText());
+                    colonequalToken.setStartIndex(colonequalToken.getStartIndex() - 1);
+                    colonequalToken.setCharPositionInLine(colonequalToken.getCharPositionInLine() - 1);
+                    this.ffgToken = colonequalToken;
+                    break;
+                default:
+                    this.addPendingToken(this.curToken);
+                    this.createNewCurToken(this.curISTRING_MIDDLEtokenType, "=", Token.DEFAULT_CHANNEL);
+            }
+        }
+        this.addPendingToken(this.curToken);
+    }
+
+    // Replaces curToken with a new one-character token positioned right after
+    // the (already trimmed) previous token.
+    private void createNewCurToken(final int type, final String text, final int channel) {
+        final CommonToken token = new CommonToken(this.curToken);
+        token.setType(type);
+        token.setText(text);
+        token.setChannel(channel);
+        token.setCharPositionInLine(token.getCharPositionInLine() + 1);
+        token.setStartIndex(token.getStartIndex() + 1);
+        token.setStopIndex(token.getStartIndex());
+        this.curToken = token;
+    }
+
+    // Mirrors pushMode()/popMode() in curLexerMode/lexerModeStack, because not
+    // all ANTLR runtimes expose _mode/_modeStack (see field comment above).
+    private void pushLexerMode(final int mode) {
+        this.pushMode(mode);
+        this.lexerModeStack.push(this.curLexerMode);
+        this.curLexerMode = mode;
+    }
+
+    private void popLexerMode() {
+        this.popMode();
+        this.curLexerMode = this.lexerModeStack.pop();
+    }
+
+    private void handleFORMAT_SPECIFICATION_MODE() {
+        if (this.lexerModeStack.isEmpty() || this.ffgToken.getType() != PythonLexer.RBRACE) {
+            return;
+        }
+
+        // insert an empty FSTRING_MIDDLE or TSTRING_MIDDLE token instead of the missing format specification
+        switch (this.curToken.getType()) {
+            case PythonLexer.COLON:
+                this.createAndAddPendingToken(this.curISTRING_MIDDLEtokenType, "", this.ffgToken);
+                break;
+            case PythonLexer.RBRACE:
+                // only when the previous brace expression is not a dictionary comprehension or set comprehension
+                if (!isValid_DictionaryOrSet_ComprehensionExpression(this.prevBraceExpression)) {
+                    this.createAndAddPendingToken(this.curISTRING_MIDDLEtokenType, "", this.ffgToken);
+                }
+                break;
+            default:
+                break;
+        }
+    }
+
+    // Re-parses the collected brace expression text with a fresh lexer/parser
+    // to decide whether it is a dict or set comprehension.
+    private boolean isValid_DictionaryOrSet_ComprehensionExpression(final String code) {
+        final CharStream inputStream = CharStreams.fromString(code);
+        final PythonLexer lexer = new PythonLexer(inputStream);
+        final CommonTokenStream tokenStream = new CommonTokenStream(lexer);
+        PythonParser parser = new PythonParser(tokenStream);
+
+        // Disable error listeners to suppress console output
+        lexer.removeErrorListeners();
+        parser.removeErrorListeners();
+
+        parser.dictcomp(); // Try parsing as dictionary comprehension
+        if (parser.getNumberOfSyntaxErrors() == 0)
+            return true;
+
+        parser = new PythonParser(tokenStream);
+        tokenStream.seek(0);
+        parser.removeErrorListeners();
+        parser.setcomp(); // Try parsing as set comprehension
+        return parser.getNumberOfSyntaxErrors() == 0;
+    }
+
+    private void insertTrailingTokens() {
+        switch (this.lastPendingTokenTypeFromDefaultChannel) {
+            case PythonLexer.NEWLINE:
+            case PythonLexer.DEDENT:
+                break; // no trailing NEWLINE token is needed
+            default: // insert an extra trailing NEWLINE token that serves as the end of the last statement
+                this.createAndAddPendingToken(PythonLexer.NEWLINE, null, this.ffgToken); // ffgToken is EOF
+        }
+        this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
+    }
+
+    private void handleEOFtoken() {
+        if (this.lastPendingTokenTypeFromDefaultChannel > 0) {
+            // there was a statement in the input (leading NEWLINE tokens are hidden)
+            this.insertTrailingTokens();
+        }
+        this.addPendingToken(this.curToken);
+    }
+
+    private void hideAndAddPendingToken(final Token originalToken) {
+        final CommonToken token = new CommonToken(originalToken);
+        token.setChannel(Token.HIDDEN_CHANNEL);
+        this.addPendingToken(token);
+    }
+
+    private void createAndAddPendingToken(final int tokenType, final String text, final Token originalToken) {
+        final CommonToken token = new CommonToken(originalToken);
+        token.setType(tokenType);
+        token.setChannel(Token.DEFAULT_CHANNEL);
+        token.setStopIndex(originalToken.getStartIndex() - 1);
+        token.setText(text == null ?
+                      "<" + this.getVocabulary().getSymbolicName(tokenType) + ">" :
+                      text);
+
+        this.addPendingToken(token);
+    }
+
+    private void addPendingToken(final Token token) {
+        // save the last pending token type because the pendingTokens list can be empty by the nextToken()
+        this.previousPendingTokenType = token.getType();
+        if (token.getChannel() == Token.DEFAULT_CHANNEL) {
+            this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
+        }
+        this.pendingTokens.addLast(token);
+    }
+
+    private int getIndentationLength(final String indentText) { // the indentText may contain spaces, tabs or form feeds
+        int length = 0;
+        for (char ch : indentText.toCharArray()) {
+            switch (ch) {
+                case ' ':
+                    this.wasSpaceIndentation = true;
+                    length += 1;
+                    break;
+                case '\t':
+                    this.wasTabIndentation = true;
+                    length += PythonLexerBase.TAB_LENGTH - (length % PythonLexerBase.TAB_LENGTH);
+                    break;
+                case '\f': // form feed
+                    length = 0;
+                    break;
+            }
+        }
+
+        if (this.wasTabIndentation && this.wasSpaceIndentation) {
+            if (!(this.wasIndentationMixedWithSpacesAndTabs)) {
+                this.wasIndentationMixedWithSpacesAndTabs = true;
+                length = PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent
+            }
+        }
+        return length;
+    }
+
+    private void reportLexerError(final String errMsg) {
+        this.getErrorListenerDispatch().syntaxError(this, this.curToken.getType(), this.curToken.getLine(), this.curToken.getCharPositionInLine(), " LEXER" + ERR_TXT + errMsg, null);
+    }
+
+    private void reportError(final String errMsg) {
+        this.reportLexerError(errMsg);
+        this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, ERR_TXT + errMsg, this.ffgToken);
+        // the ERRORTOKEN also triggers a parser error
+    }
+}
diff --git a/python/python3_14/JavaScript/PythonLexerBase.js b/python/python3_14/JavaScript/PythonLexerBase.js
new file mode 100644
index 0000000000..53280d051d
--- /dev/null
+++ b/python/python3_14/JavaScript/PythonLexerBase.js
@@ -0,0 +1,778 @@
+/*
+The MIT License (MIT)
+Copyright (c) 2021 Robert Einhorn
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+ */
+
+/*
+ *
+ * Project : A helper class for an ANTLR4 Python lexer grammar that assists in tokenizing indentation,
+ * interpolated strings, and encoding declaration.
+ *
+ * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com
+ *
+ */
+
+import { CharStreams, CommonTokenStream, Token, CommonToken, Lexer } from "antlr4";
+import PythonLexer from "./PythonLexer.js";
+import PythonParser from "./PythonParser.js";
+
+export default class PythonLexerBase extends Lexer {
+ static #LEXER_MODES_FOR_ISTRING_START = new Map();
+ static #INVALID_LENGTH = -1;
+ static #ERR_TXT = " ERROR: ";
+ static #TAB_LENGTH = 8;
+
+ #encodingName;
+
+ // Indentation handling
+ #indentLengthStack;
+ #pendingTokens;
+
+ #previousPendingTokenType;
+ #lastPendingTokenTypeFromDefaultChannel;
+
+ // Parenthesis / bracket / brace counts
+ #opened;
+ #paren_or_bracket_openedStack;
+ #braceExpressionStack;
+ #prevBraceExpression;
+
+ // Current interpolated STRING_MIDDLE token type (FSTRING_MIDDLE or TSTRING_MIDDLE)
+    #curISTRING_MIDDLEtokenType;
+
+ // We reimplement mode/stack because not all runtimes expose _mode/_modeStack
+ #curLexerMode;
+ #lexerModeStack;
+
+ // Indentation diagnostics
+ #wasSpaceIndentation;
+ #wasTabIndentation;
+ #wasIndentationMixedWithSpacesAndTabs;
+
+ // Current / lookahead tokens
+ #curToken;
+ #ffgToken;
+
+ constructor(input) {
+ super(input);
+ this.#init();
+ }
+
+ reset() {
+ this.#init();
+ super.reset();
+ }
+
+ #init() {
+ this.#encodingName = "";
+ this.#indentLengthStack = [];
+ this.#pendingTokens = [];
+ this.#previousPendingTokenType = 0;
+ this.#lastPendingTokenTypeFromDefaultChannel = 0;
+ this.#opened = 0;
+ this.#paren_or_bracket_openedStack = [];
+ this.#braceExpressionStack = [];
+ this.#prevBraceExpression = "";
+ this.#curISTRING_MIDDLEtokenType = 0;
+ this.#curLexerMode = Lexer.DEFAULT_MODE;
+ this.#lexerModeStack = [];
+ this.#wasSpaceIndentation = false;
+ this.#wasTabIndentation = false;
+ this.#wasIndentationMixedWithSpacesAndTabs = false;
+ this.#curToken = null;
+ this.#ffgToken = null;
+ }
+
+ /**
+ * Sets the encoding name to emit an ENCODING token at the start of the token stream.
+ * Leave empty if not needed (e.g., when parsing from string).
+ *
+ * @param {string} encodingName - The encoding name (e.g., "utf-8"), or empty string to disable ENCODING token.
+ */
+ setEncodingName(encodingName) {
+ this.#encodingName = encodingName;
+ }
+
+ nextToken() { // Reading the input stream until EOF is reached
+ this.#checkNextToken();
+ return this.#pendingTokens.shift() /* stack pollFirst() */; // Add the queued token to the token stream
+ }
+
+ #checkNextToken() {
+ if (this.#previousPendingTokenType === Token.EOF) {
+ return;
+ }
+
+ this.#setCurrentAndFollowingTokens();
+ if (this.#indentLengthStack.length === 0) { // We're at the first token
+ this.#handleStartOfInput();
+ }
+
+ switch (this.#curToken.type) {
+ case PythonLexer.NEWLINE:
+ this.#handleNEWLINEtoken();
+ break;
+ case PythonLexer.LPAR:
+ case PythonLexer.LSQB:
+ case PythonLexer.LBRACE:
+ this.#opened++;
+ this.#addPendingToken(this.#curToken);
+ break;
+ case PythonLexer.RPAR:
+ case PythonLexer.RSQB:
+ case PythonLexer.RBRACE:
+ this.#opened--;
+ this.#addPendingToken(this.#curToken);
+ break;
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE:
+ this.#handleFSTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field
+ this.#addPendingToken(this.#curToken);
+ break;
+ case PythonLexer.COLONEQUAL:
+ this.#handleCOLONEQUALtokenInIString();
+ break;
+ case PythonLexer.ERRORTOKEN:
+ this.#reportLexerError(`token recognition error at: '${this.#curToken.text}'`);
+ this.#addPendingToken(this.#curToken);
+ break;
+ case Token.EOF:
+ this.#handleEOFtoken();
+ break;
+ default:
+ this.#addPendingToken(this.#curToken);
+ }
+ this.#handleFORMAT_SPECIFICATION_MODE();
+ }
+
+ #setCurrentAndFollowingTokens() {
+        this.#curToken = this.#ffgToken === null ?
+ super.nextToken() :
+ this.#ffgToken;
+
+ this.#checkCurToken(); // Do not use ffgToken in this method or any of its submethods — it hasn't been set yet!
+
+ this.#ffgToken = this.#curToken.type === Token.EOF ?
+ this.#curToken :
+ super.nextToken();
+ }
+
+ // - initialize indent stack
+ // - skip BOM token
+ // - insert ENCODING token (if any)
+ // - hide leading NEWLINE(s)
+ // - insert leading INDENT if first statement is indented
+ #handleStartOfInput() {
+ // initialize the stack with a default 0 indentation length
+ this.#indentLengthStack.push(0); // this will never be popped off
+
+ if (this.#curToken.type === PythonLexer.BOM) {
+ this.#setCurrentAndFollowingTokens();
+ }
+
+ this.#insertENCODINGtoken();
+
+ while (this.#curToken.type !== Token.EOF) {
+ if (this.#curToken.channel === Token.DEFAULT_CHANNEL) {
+ if (this.#curToken.type === PythonLexer.NEWLINE) {
+ // all the NEWLINE tokens must be ignored before the first statement
+ this.#hideAndAddPendingToken(this.#curToken);
+ } else { // We're at the first statement
+ this.#insertLeadingIndentToken();
+ return; // continue the processing of the current token with #checkNextToken()
+ }
+ } else {
+ this.#addPendingToken(this.#curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
+ }
+ this.#setCurrentAndFollowingTokens();
+ } // continue the processing of the EOF token with #checkNextToken()
+ }
+
+ #insertENCODINGtoken() {
+        if (this.#encodingName === "") return;
+
+ const sourcePair = [this, this._input];
+ const encodingToken = new CommonToken(sourcePair, PythonLexer.ENCODING, Token.HIDDEN_CHANNEL, /*start*/ 0, /*stop*/ 0);
+ encodingToken.text = this.#encodingName;
+ encodingToken.line = 0;
+ encodingToken.column = -1;
+ this.#addPendingToken(encodingToken);
+ }
+
+ #insertLeadingIndentToken() {
+ if (this.#previousPendingTokenType === PythonLexer.WS) {
+ const prevToken = this.#pendingTokens.at(-1); /* stack peek */ // WS token
+ if (this.#getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement
+ const errMsg = "first statement indented";
+ this.#reportLexerError(errMsg);
+ // insert an INDENT token before the first statement to trigger an 'unexpected indent' error later in the parser
+ this.#createAndAddPendingToken(PythonLexer.INDENT, PythonLexerBase.#ERR_TXT + errMsg, this.#curToken);
+ }
+ }
+ }
+
+ #handleNEWLINEtoken() {
+ if (this.#lexerModeStack.length > 0) { // for multi line f/t-string literals
+ this.#addPendingToken(this.#curToken);
+ return;
+ }
+
+ if (this.#opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
+ this.#hideAndAddPendingToken(this.#curToken);
+ return;
+ }
+
+ const nlToken = this.#curToken.clone(); // save the current NEWLINE token
+ const isLookingAhead = this.#ffgToken.type === PythonLexer.WS;
+ if (isLookingAhead) {
+ this.#setCurrentAndFollowingTokens(); // set the next two tokens
+ }
+
+ switch (this.#ffgToken.type) {
+ case PythonLexer.NEWLINE: // We're before a blank line
+ case PythonLexer.COMMENT: // We're before a comment
+ this.#hideAndAddPendingToken(nlToken);
+ if (isLookingAhead) {
+ this.#addPendingToken(this.#curToken); // WS token
+ }
+ break;
+ default:
+ this.#addPendingToken(nlToken);
+ if (isLookingAhead) { // We're on a whitespace(s) followed by a statement
+ const indentationLength = this.#ffgToken.type === Token.EOF ?
+ 0 :
+ this.#getIndentationLength(this.#curToken.text);
+
+ if (indentationLength !== PythonLexerBase.#INVALID_LENGTH) {
+ this.#addPendingToken(this.#curToken); // WS token
+ this.#insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
+ } else {
+ this.#reportError("inconsistent use of tabs and spaces in indentation");
+ }
+ } else { // We're at a newline followed by a statement (there is no whitespace before the statement)
+ this.#insertIndentOrDedentToken(0); // may insert DEDENT token(s)
+ }
+ }
+ }
+
+ #insertIndentOrDedentToken(curIndentLength) {
+ let prevIndentLength = this.#indentLengthStack.at(-1) /* stack peek */;
+ if (curIndentLength > prevIndentLength) {
+ this.#createAndAddPendingToken(PythonLexer.INDENT, null, this.#ffgToken);
+ this.#indentLengthStack.push(curIndentLength);
+ return;
+ }
+
+ while (curIndentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
+ this.#indentLengthStack.pop();
+ prevIndentLength = this.#indentLengthStack.at(-1) /* stack peek */;
+ if (curIndentLength <= prevIndentLength) {
+ this.#createAndAddPendingToken(PythonLexer.DEDENT, null, this.#ffgToken);
+ } else {
+ this.#reportError("inconsistent dedent");
+ }
+ }
+ }
+
+ #checkCurToken() {
+ switch (this.#curToken.type) {
+ case PythonLexer.FSTRING_START:
+ this.#curISTRING_MIDDLEtokenType = PythonLexer.FSTRING_MIDDLE;
+ this.#setLexerModeByISTRING_STARTtoken();
+ return;
+ case PythonLexer.TSTRING_START:
+ this.#curISTRING_MIDDLEtokenType = PythonLexer.TSTRING_MIDDLE;
+ this.#setLexerModeByISTRING_STARTtoken();
+ return;
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE:
+ this.#handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field
+ switch (this.#curToken.type) {
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE:
+ return; // No curToken exchange happened
+ }
+ break;
+ case PythonLexer.FSTRING_END:
+ case PythonLexer.TSTRING_END:
+ this.#popLexerMode();
+ return;
+ default:
+ if (this.#lexerModeStack.length === 0) {
+ return; // Not in fstring mode
+ }
+
+ }
+ this.#processBraceExpression();
+ }
+
+ #processBraceExpression() {
+ switch (this.#curToken.type) { // the following tokens can only come from default mode (after an LBRACE in f/t-string)
+ case PythonLexer.NEWLINE:
+ // append the current brace expression with the current newline
+                this.#appendToBraceExpression(this.#curToken.text);
+ this.#curToken.channel = Token.HIDDEN_CHANNEL;
+ break;
+ case PythonLexer.LBRACE:
+ // the outermost brace expression cannot be a dictionary comprehension or a set comprehension
+ this.#braceExpressionStack.push("{");
+ this.#paren_or_bracket_openedStack.push(0);
+ this.#pushLexerMode(Lexer.DEFAULT_MODE);
+ break;
+ case PythonLexer.LPAR:
+ case PythonLexer.LSQB:
+ // append the current brace expression with a "(" or a "["
+                this.#appendToBraceExpression(this.#curToken.text);
+ // https://peps.python.org/pep-0498/#lambdas-inside-expressions
+ this.#incrementBraceStack();
+ break;
+ case PythonLexer.RPAR:
+ case PythonLexer.RSQB:
+ // append the current brace expression with a ")" or a "]"
+                this.#appendToBraceExpression(this.#curToken.text);
+ this.#decrementBraceStack();
+ break;
+ case PythonLexer.COLON:
+ case PythonLexer.COLONEQUAL:
+ // append the current brace expression with a ":" or a ":="
+                this.#appendToBraceExpression(this.#curToken.text);
+ this.#setLexerModeByCOLONorCOLONEQUALtoken();
+ break;
+ case PythonLexer.RBRACE:
+ this.#setLexerModeAfterRBRACEtoken();
+ break;
+ default:
+ // append the current brace expression with the current token text
+                this.#appendToBraceExpression(this.#curToken.text);
+ }
+ }
+
+ #appendToBraceExpression(text) {
+ const lastIndex = this.#braceExpressionStack.length - 1;
+ this.#braceExpressionStack[lastIndex] += text;
+ }
+
+ #incrementBraceStack() { // increment the last element (stack peek + 1)
+ const lastIndex = this.#paren_or_bracket_openedStack.length - 1;
+ this.#paren_or_bracket_openedStack[lastIndex]++;
+ }
+
+ #decrementBraceStack() { // decrement the last element (stack peek - 1)
+ const lastIndex = this.#paren_or_bracket_openedStack.length - 1;
+ this.#paren_or_bracket_openedStack[lastIndex]--;
+ }
+
+ #setLexerModeAfterRBRACEtoken() {
+ switch (this.#curLexerMode) {
+ case Lexer.DEFAULT_MODE:
+ this.#popLexerMode();
+ this.#popByBRACE();
+ break;
+ case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#popLexerMode();
+ this.#popLexerMode();
+ this.#popByBRACE();
+ break;
+ default:
+ this.#reportLexerError("f-string: single '}' is not allowed");
+ }
+ }
+
+ #setLexerModeByISTRING_STARTtoken() { // ISTRING = interpolated string (FSTRING or TSTRING)
+ if (PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.size === 0) {
+ PythonLexerBase.#initLexerModesForIStringStart();
+ }
+
+ const interpolatedStringPrefix = this.#curToken.text.toLowerCase();
+ if (PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.has(interpolatedStringPrefix)) {
+ const newLexerMode = PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.get(interpolatedStringPrefix);
+ this.#pushLexerMode(newLexerMode);
+ } else {
+ this.#reportLexerError(
+ "internal error: unknown interpolated string literal prefix: " + this.#curToken.text
+ );
+ }
+ }
+
+ static #initLexerModesForIStringStart() {
+ // f-strings
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("f'", PythonLexer.SQ1__FSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rf'", PythonLexer.SQ1R_FSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("fr'", PythonLexer.SQ1R_FSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("f\"", PythonLexer.DQ1__FSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rf\"", PythonLexer.DQ1R_FSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("fr\"", PythonLexer.DQ1R_FSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("f'''", PythonLexer.SQ3__FSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rf'''", PythonLexer.SQ3R_FSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("fr'''", PythonLexer.SQ3R_FSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("f\"\"\"", PythonLexer.DQ3__FSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rf\"\"\"", PythonLexer.DQ3R_FSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("fr\"\"\"", PythonLexer.DQ3R_FSTRING_MODE);
+
+ // t-strings
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("t'", PythonLexer.SQ1__TSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rt'", PythonLexer.SQ1R_TSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("tr'", PythonLexer.SQ1R_TSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("t\"", PythonLexer.DQ1__TSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rt\"", PythonLexer.DQ1R_TSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("tr\"", PythonLexer.DQ1R_TSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("t'''", PythonLexer.SQ3__TSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rt'''", PythonLexer.SQ3R_TSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("tr'''", PythonLexer.SQ3R_TSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("t\"\"\"", PythonLexer.DQ3__TSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("rt\"\"\"", PythonLexer.DQ3R_TSTRING_MODE);
+ PythonLexerBase.#LEXER_MODES_FOR_ISTRING_START.set("tr\"\"\"", PythonLexer.DQ3R_TSTRING_MODE);
+ }
+
+ #setLexerModeByCOLONorCOLONEQUALtoken() {
+ // Exit early when the current lexer mode indicates an open parenthesis/bracket
+ const opened = this.#paren_or_bracket_openedStack.at(-1) > 0; /* stack peek */
+ if (opened) {
+ return;
+ }
+
+        // COLONEQUAL token will be replaced with a COLON token in #checkNextToken()
+ const prevLexerMode = this.#lexerModeStack.at(-1); /* stack peek */
+ switch (prevLexerMode) {
+ case PythonLexer.SQ1__FSTRING_MODE:
+ case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ1__TSTRING_MODE:
+ case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ1R_FSTRING_MODE:
+ case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ1R_TSTRING_MODE:
+ case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ1__FSTRING_MODE:
+ case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ1__TSTRING_MODE:
+ case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ1R_FSTRING_MODE:
+ case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ1R_TSTRING_MODE:
+ case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ3__FSTRING_MODE:
+ case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ3__TSTRING_MODE:
+ case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ3R_FSTRING_MODE:
+ case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ3R_TSTRING_MODE:
+ case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ3__FSTRING_MODE:
+ case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ3__TSTRING_MODE:
+ case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ3R_FSTRING_MODE:
+ case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ3R_TSTRING_MODE:
+ case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.#pushLexerMode(PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+ }
+ }
+
+ #popByBRACE() {
+ this.#paren_or_bracket_openedStack.pop();
+ const curBraceExpression = this.#braceExpressionStack.pop();
+ this.#prevBraceExpression = curBraceExpression + "}";
+ if (this.#braceExpressionStack.length > 0) {
+ // Extend the current brace expression by adding the previous expression
+ const lastIndex = this.#braceExpressionStack.length - 1;
+ this.#braceExpressionStack[lastIndex] += this.#prevBraceExpression;
+ }
+ }
+
+ #handleFSTRING_MIDDLEtokenWithDoubleBrace() { // ISTRING = interpolated string (FSTRING or TSTRING)
+ // replace the trailing double brace with a single brace and insert a hidden brace token
+ const lastTwoChars = this.#getLastTwoCharsOfTheCurTokenText();
+ switch (lastTwoChars) {
+ case "{{":
+ this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL);
+ break;
+ case "}}":
+ this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL);
+ break;
+ }
+ }
+
+ #handleFSTRING_MIDDLEtokenWithQuoteAndLBrace() { // ISTRING = interpolated string (FSTRING or TSTRING)
+ // replace the trailing quote + left_brace with a quote and insert an LBRACE token
+ // replace the trailing backslash + left_brace with a backslash and insert an LBRACE token
+ const lastTwoChars = this.#getLastTwoCharsOfTheCurTokenText();
+ switch (lastTwoChars) {
+ case "\"{":
+ case "'{":
+ case "\\{":
+ this.#trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL);
+ break;
+ }
+ }
+
+ #getLastTwoCharsOfTheCurTokenText() {
+ const text = this.#curToken.text;
+ return text.length <= 2 ? text : text.slice(-2);
+ }
+
+ #trimLastCharAddPendingTokenSetCurToken(type, text, channel) {
+ // trim the last char and add the modified curToken to the pendingTokens stack
+ const tokenTextWithoutLastChar = this.#curToken.text.slice(0, -1);
+ this.#curToken.text = tokenTextWithoutLastChar;
+ this.#curToken.stop -= 1;
+ this.#addPendingToken(this.#curToken);
+
+ this.#createNewCurToken(type, text, channel); // set curToken
+ }
+
+ #handleCOLONEQUALtokenInIString() { // ISTRING = interpolated string (FSTRING or TSTRING)
+ if (this.#lexerModeStack.length > 0 &&
+ this.#paren_or_bracket_openedStack.at(-1) === 0) { // stack peek === 0
+
+ // In an f/t-string, the walrus operator (:=) is only allowed inside parentheses.
+ // If used outside, split the COLONEQUAL token into a COLON
+ // (used as a format specifier instead of a walrus operator),
+ // and move the equal sign to the beginning of the next token (FSTRING_MIDDLE or TSTRING_MIDDLE).
+ this.#curToken.type = PythonLexer.COLON;
+ this.#curToken.text = ":";
+ this.#curToken.stop = this.#curToken.start;
+
+ switch (this.#ffgToken.type) {
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE: {
+ const token = this.#ffgToken.clone();
+ token.text = "=" + token.text;
+ token.start -= 1;
+ token.column -= 1;
+ this.#ffgToken = token;
+ break;
+ }
+ default: {
+ this.#addPendingToken(this.#curToken);
+ this.#createNewCurToken(this.#curISTRING_MIDDLEtokenType, "=", Token.DEFAULT_CHANNEL);
+ }
+ }
+ }
+ this.#addPendingToken(this.#curToken);
+ }
+
+ #createNewCurToken(type, text, channel) {
+ const token = this.#curToken.clone();
+ token.type = type;
+ token.text = text;
+ token.channel = channel;
+ token.column += 1;
+ token.start += 1;
+ token.stop = token.start;
+ this.#curToken = token;
+ }
+
+ #pushLexerMode(mode) {
+ this.pushMode(mode);
+ this.#lexerModeStack.push(this.#curLexerMode);
+ this.#curLexerMode = mode;
+ }
+
+ #popLexerMode() {
+ this.popMode();
+ this.#curLexerMode = this.#lexerModeStack.pop();
+ }
+
+ #handleFORMAT_SPECIFICATION_MODE() {
+        if (this.#lexerModeStack.length === 0 || this.#ffgToken.type !== PythonLexer.RBRACE) {
+ return;
+ }
+
+ // insert an empty FSTRING_MIDDLE or TSTRING_MIDDLE token instead of the missing format specification
+ switch (this.#curToken.type) {
+ case PythonLexer.COLON:
+ this.#createAndAddPendingToken(this.#curISTRING_MIDDLEtokenType, "", this.#ffgToken);
+ break;
+ case PythonLexer.RBRACE:
+ // only when the previous brace expression is not a dictionary comprehension or set comprehension
+ if (!this.#isValid_DictionaryOrSet_ComprehensionExpression(this.#prevBraceExpression)) {
+ this.#createAndAddPendingToken(this.#curISTRING_MIDDLEtokenType, "", this.#ffgToken);
+ }
+ break;
+ }
+ }
+
+ #isValid_DictionaryOrSet_ComprehensionExpression(code) {
+ const inputStream = CharStreams.fromString(code);
+ const lexer = new PythonLexer(inputStream);
+ const tokenStream = new CommonTokenStream(lexer);
+ let parser = new PythonParser(tokenStream);
+
+ // Disable error listeners to suppress console output
+ lexer.removeErrorListeners();
+ parser.removeErrorListeners();
+
+ parser.dictcomp(); // Try parsing as dictionary comprehension
+ if (parser.syntaxErrorsCount === 0)
+ return true;
+
+ parser = new PythonParser(tokenStream);
+ tokenStream.seek(0);
+ parser.removeErrorListeners();
+ parser.setcomp(); // Try parsing as set comprehension
+ return parser.syntaxErrorsCount === 0;
+ }
+
+ #insertTrailingTokens() {
+ switch (this.#lastPendingTokenTypeFromDefaultChannel) {
+ case PythonLexer.NEWLINE:
+ case PythonLexer.DEDENT:
+ break; // no trailing NEWLINE token is needed
+ default:
+ // insert an extra trailing NEWLINE token that serves as the end of the last statement
+ this.#createAndAddPendingToken(PythonLexer.NEWLINE, null, this.#ffgToken); // ffgToken is EOF
+ }
+ this.#insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
+ }
+
+ #handleEOFtoken() {
+ if (this.#lastPendingTokenTypeFromDefaultChannel > 0) {
+ // there was a statement in the input (leading NEWLINE tokens are hidden)
+ this.#insertTrailingTokens();
+ }
+ this.#addPendingToken(this.#curToken);
+ }
+
+ #hideAndAddPendingToken(originalToken) {
+ originalToken.channel = Token.HIDDEN_CHANNEL;
+ this.#addPendingToken(originalToken);
+ }
+
+ #createAndAddPendingToken(type, text, originalToken) {
+ const token = originalToken.clone();
+ token.type = type;
+ token.channel = Token.DEFAULT_CHANNEL;
+ token.stop = originalToken.start - 1;
+ token.text = text == null ?
+ `<${PythonLexer.symbolicNames[type] ?? ""}>` :
+ text;
+
+ this.#addPendingToken(token);
+ }
+
+ #addPendingToken(token) {
+ // save the last pending token type because the pendingTokens linked list can be empty by the nextToken()
+ this.#previousPendingTokenType = token.type;
+ if (token.channel === Token.DEFAULT_CHANNEL) {
+ this.#lastPendingTokenTypeFromDefaultChannel = this.#previousPendingTokenType;
+ }
+ this.#pendingTokens.push(token) /* .addLast(token) */;
+ }
+
+ #getIndentationLength(indentText) { // the indentText may contain spaces, tabs or form feeds
+ let length = 0;
+ for (let ch of indentText) {
+ switch (ch) {
+ case " ":
+ this.#wasSpaceIndentation = true;
+ length += 1;
+ break;
+ case "\t":
+ this.#wasTabIndentation = true;
+ length += PythonLexerBase.#TAB_LENGTH - (length % PythonLexerBase.#TAB_LENGTH);
+ break;
+ case "\f": // form feed
+ length = 0;
+ break;
+ }
+ }
+
+ if (this.#wasTabIndentation && this.#wasSpaceIndentation) {
+ if (!this.#wasIndentationMixedWithSpacesAndTabs) {
+ this.#wasIndentationMixedWithSpacesAndTabs = true;
+ length = PythonLexerBase.#INVALID_LENGTH; // only for the first inconsistent indent
+ }
+ }
+ return length;
+ }
+
+ #reportLexerError(errMsg) {
+ this.getErrorListener().syntaxError(this, this.#curToken.type, this.#curToken.line, this.#curToken.column, " LEXER" + PythonLexerBase.#ERR_TXT + errMsg, null);
+ }
+
+ #reportError(errMsg) {
+ this.#reportLexerError(errMsg);
+
+ this.#createAndAddPendingToken(PythonLexer.ERRORTOKEN, PythonLexerBase.#ERR_TXT + errMsg, this.#ffgToken);
+ // the ERRORTOKEN also triggers a parser error
+ }
+}
diff --git a/python/python3_14/Python3/PythonLexerBase.py b/python/python3_14/Python3/PythonLexerBase.py
new file mode 100644
index 0000000000..8311f59816
--- /dev/null
+++ b/python/python3_14/Python3/PythonLexerBase.py
@@ -0,0 +1,595 @@
+# The MIT License (MIT)
+# Copyright (c) 2021 Robert Einhorn
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# Project : A helper class for an ANTLR4 Python lexer grammar that assists in tokenizing indentation,
+# interpolated strings, and encoding declaration.
+#
+# Developed by : Robert Einhorn
+
+from collections import deque
+from typing import Literal, TextIO, Optional
+from antlr4 import InputStream, Lexer, Token
+from antlr4.Token import CommonToken
+import PythonLexer
+import sys
+
+# Module-level constants (NOT class attributes of PythonLexerBase).
+INVALID_LENGTH: Literal[-1] = -1  # sentinel for the first indentation that mixes tabs and spaces
+ERR_TXT: Literal[" ERROR: "] = " ERROR: "  # prefix used in generated error token text
+TAB_LENGTH: Literal[8] = 8  # tab stop width used when measuring indentation
+
+class PythonLexerBase(Lexer):
+    """Lexer superclass: tokenizes indentation (INDENT/DEDENT), f/t-string interpolation,
+    and the optional ENCODING token."""
+
+    # lazily filled map: interpolated-string prefix (e.g. "f'", 'rt"""') -> lexer mode
+    _LEXER_MODES_FOR_ISTRING_START: dict[str, int] = {} # static field
+
+    def __init__(self, input: InputStream, output: TextIO = sys.stdout):
+        super().__init__(input, output)
+        self._init()
+
+    def reset(self) -> None:
+        # reset our own state first, then let the ANTLR runtime reset itself
+        self._init()
+        super().reset()
+
+    def _init(self) -> None:
+        """(Re)initializes all helper state; called from __init__ and reset()."""
+        self._encodingName: str = ""  # NOTE(review): camelCase, inconsistent with the snake_case fields below
+
+        # Indentation handling
+        self._indent_length_stack: list[int] = []
+        self._pending_tokens: deque[CommonToken] = deque()
+
+        self._previous_pending_token_type: int = 0
+        self._last_pending_token_type_from_default_channel: int = 0
+
+        # Parenthesis / bracket / brace counts
+        self._opened: int = 0
+        self._paren_or_bracket_opened_stack: list[int] = []
+        self._brace_expression_stack: list[str] = []
+        self._prev_brace_expression: str = ""
+
+        # Current interpolated STRING_MIDDLE token type (FSTRING_MIDDLE or TSTRING_MIDDLE)
+        self._cur_ISTRING_MIDDLE_token_type: int = 0
+
+        # We reimplement mode/stack because not all runtimes expose _mode/_modeStack
+        self._cur_lexer_mode: int = Lexer.DEFAULT_MODE
+        self._lexer_mode_stack: list[int] = []
+
+        # Indentation diagnostics
+        self._was_space_indentation: bool = False
+        self._was_tab_indentation: bool = False
+        self._was_indentation_mixed_with_spaces_and_tabs: bool = False
+
+        # Current / lookahead tokens (None only until the first _set_current_and_following_tokens())
+        self._cur_token: Optional[CommonToken] = None
+        self._ffg_token: Optional[CommonToken] = None
+
+    def set_encoding_name(self, encoding_name: str) -> None:
+        """
+        Sets the encoding name to emit an ENCODING token at the start of the token stream.
+        Leave empty if not needed (e.g., when parsing from string).
+
+        NOTE(review): presumably must be called before the first nextToken() call,
+        since the ENCODING token is only inserted at the start of input — confirm.
+
+        :param encoding_name: The encoding name (e.g., "utf-8"), or empty string to disable ENCODING token.
+        """
+        self._encodingName = encoding_name
+
+    def nextToken(self) -> CommonToken: # Reading the input stream until EOF is reached
+        """Main entry point: refills the pending queue, then emits the next queued token."""
+        self._check_next_token()
+        # after EOF has been queued once, _check_next_token() appends nothing more;
+        # ANTLR stops calling nextToken() once EOF is delivered
+        return self._pending_tokens.popleft() # Add the queued token to the token stream
+
+    def _check_next_token(self) -> None:
+        """Advances the underlying lexer one token and dispatches per token type,
+        queueing one or more tokens into _pending_tokens."""
+        if self._previous_pending_token_type == Token.EOF:
+            return
+
+        self._set_current_and_following_tokens()
+        if not self._indent_length_stack: # We're at the first token
+            self._handle_start_of_input()
+
+        match self._cur_token.type:
+            case self.NEWLINE:
+                self._handle_NEWLINE_token()
+            case self.LPAR | self.LSQB | self.LBRACE:
+                self._opened += 1
+                self._add_pending_token(self._cur_token)
+            case self.RPAR | self.RSQB | self.RBRACE:
+                self._opened -= 1
+                self._add_pending_token(self._cur_token)
+            case self.FSTRING_MIDDLE | self.TSTRING_MIDDLE:
+                self._handle_ISTRING_MIDDLE_token_with_double_brace() # does not affect the opened field
+                self._add_pending_token(self._cur_token)
+            case self.COLONEQUAL:
+                self._handle_COLONEQUAL_token_in_istring()
+            case self.ERRORTOKEN:
+                self._report_lexer_error("token recognition error at: '" + self._cur_token.text + "'")
+                self._add_pending_token(self._cur_token)
+            case Token.EOF:
+                self._handle_EOF_token()
+            case _:
+                self._add_pending_token(self._cur_token)
+        self._handle_FORMAT_SPECIFICATION_MODE()
+
+    def _set_current_and_following_tokens(self) -> None:
+        """Shifts the one-token lookahead window: _ffg_token becomes _cur_token,
+        then a fresh lookahead token is read (EOF is never read past)."""
+        self._cur_token = super().nextToken() if self._ffg_token is None else \
+                          self._ffg_token
+
+        self._check_cur_token() # Do not use ffgToken in this method or any of its submethods — it hasn't been set yet!
+
+        self._ffg_token = self._cur_token if self._cur_token.type == Token.EOF else \
+                          super().nextToken()
+
+    # - initialize indent stack
+    # - skip BOM token
+    # - insert ENCODING token (if any)
+    # - hide leading NEWLINE(s)
+    # - insert leading INDENT if first statement is indented
+    def _handle_start_of_input(self) -> None:
+        """One-time setup performed when the very first token is seen."""
+        # initialize the stack with a default 0 indentation length
+        self._indent_length_stack.append(0) # this will never be popped off
+
+        if self._cur_token.type == self.BOM:
+            self._set_current_and_following_tokens()  # skip the BOM token
+        self._insert_ENCODING_token()
+
+        while self._cur_token.type != Token.EOF:
+            if self._cur_token.channel == Token.DEFAULT_CHANNEL:
+                if self._cur_token.type == self.NEWLINE:
+                    # all the NEWLINE tokens must be ignored before the first statement
+                    self._hide_and_add_pending_token(self._cur_token)
+                else: # We're at the first statement
+                    self._insert_leading_indent_token()
+                    return # continue the processing of the current token with _check_next_token()
+            else:
+                self._add_pending_token(self._cur_token) # it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
+            self._set_current_and_following_tokens()
+        # continue the processing of the EOF token with _check_next_token()
+
+    def _insert_ENCODING_token(self) -> None: # https://peps.python.org/pep-0263/
+        """Queues a hidden, zero-width ENCODING token carrying the declared encoding name (if any)."""
+        if not self._encodingName:
+            return
+
+        source_pair = self._tokenFactorySourcePair
+        encoding_token: CommonToken = CommonToken(source_pair, self.ENCODING, Token.HIDDEN_CHANNEL, start = 0, stop = 0)
+        encoding_token.text = self._encodingName
+        encoding_token.line = 0
+        encoding_token.column = -1  # before the first real character
+        self._add_pending_token(encoding_token)
+
+    def _insert_leading_indent_token(self) -> None:
+        """If the very first statement is indented, reports the error and queues an
+        INDENT token so the parser also produces an 'unexpected indent' error."""
+        if self._previous_pending_token_type == self.WS:
+            prev_token: CommonToken = self._pending_tokens[-1] # stack peek, WS token
+            if self._get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement
+                err_msg: str = "first statement indented"
+                self._report_lexer_error(err_msg)
+                # insert an INDENT token before the first statement to trigger an 'unexpected indent' error later in the parser
+                self._create_and_add_pending_token(self.INDENT, ERR_TXT + err_msg, self._cur_token)
+
+    def _handle_NEWLINE_token(self) -> None:
+        """Decides whether a NEWLINE is significant; when it is, measures the following
+        indentation and queues INDENT/DEDENT tokens accordingly."""
+        if self._lexer_mode_stack: # for multi line f/t-string literals
+            self._add_pending_token(self._cur_token)
+            return
+
+        if self._opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token
+            self._hide_and_add_pending_token(self._cur_token)
+            return
+
+        nl_token: CommonToken = self._cur_token.clone() # save the current NEWLINE token
+        is_looking_ahead: bool = self._ffg_token.type == self.WS
+        if is_looking_ahead:
+            self._set_current_and_following_tokens() # set the next two tokens
+
+        match self._ffg_token.type:
+            case self.NEWLINE | self.COMMENT:
+                # We're before a blank line or a comment or type comment or a type ignore comment
+                self._hide_and_add_pending_token(nl_token) # ignore the NEWLINE token
+                if is_looking_ahead:
+                    self._add_pending_token(self._cur_token) # WS token
+            case _:
+                self._add_pending_token(nl_token)
+                if is_looking_ahead: # We're on a whitespace(s) followed by a statement
+                    # whitespace immediately before EOF carries no indentation meaning
+                    indentation_length: int = 0 if self._ffg_token.type == Token.EOF else \
+                                              self._get_indentation_length(self._cur_token.text)
+
+                    if indentation_length != INVALID_LENGTH:
+                        self._add_pending_token(self._cur_token) # WS token
+                        self._insert_INDENT_or_DEDENT_token(indentation_length) # may insert INDENT token or DEDENT token(s)
+                    else:
+                        self._report_error("inconsistent use of tabs and spaces in indentation")
+                else: # We're at a newline followed by a statement (there is no whitespace before the statement)
+                    self._insert_INDENT_or_DEDENT_token(0) # may insert DEDENT token(s)
+
+    def _insert_INDENT_or_DEDENT_token(self, indent_length: int) -> None:
+        """Compares indent_length with the indentation stack: queues one INDENT on an
+        increase, or one DEDENT per popped level on a decrease."""
+        prev_indent_length: int = self._indent_length_stack[-1] # stack peek
+        if indent_length > prev_indent_length:
+            self._create_and_add_pending_token(self.INDENT, None, self._ffg_token)
+            self._indent_length_stack.append(indent_length) # stack push
+            return
+
+        while indent_length < prev_indent_length: # more than 1 DEDENT token may be inserted to the token stream
+            self._indent_length_stack.pop()
+            prev_indent_length = self._indent_length_stack[-1] # stack peek
+            if indent_length <= prev_indent_length:
+                self._create_and_add_pending_token(self.DEDENT, None, self._ffg_token)
+            else:
+                # the new indentation does not match any outer level
+                self._report_error("inconsistent dedent")
+
+    def _check_cur_token(self) -> None:
+        """Adjusts lexer-mode state as soon as a token is read (before the lookahead
+        token exists): enters/exits f/t-string modes and tracks brace expressions."""
+        match self._cur_token.type:
+            case self.FSTRING_START:
+                self._cur_ISTRING_MIDDLE_token_type = self.FSTRING_MIDDLE
+                self._set_lexer_mode_by_ISTRING_START_token()
+                return
+            case self.TSTRING_START:
+                self._cur_ISTRING_MIDDLE_token_type = self.TSTRING_MIDDLE
+                self._set_lexer_mode_by_ISTRING_START_token()
+                return
+            case self.FSTRING_MIDDLE | self.TSTRING_MIDDLE:
+                self._handle_ISTRING_MIDDLE_token_with_quote_and_lbrace() # affects the opened field
+                match self._cur_token.type:
+                    case self.FSTRING_MIDDLE | self.TSTRING_MIDDLE:
+                        return # No _cur_token exchange happened
+            case self.FSTRING_END | self.TSTRING_END:
+                self._pop_lexer_mode()
+                return
+            case _:
+                if not self._lexer_mode_stack:
+                    return # Not in fstring mode
+        self._process_brace_expression()
+
+    def _process_brace_expression(self) -> None:
+        """While inside an f/t-string replacement field, records the expression text
+        and switches lexer modes on braces, colons and parentheses/brackets."""
+        match self._cur_token.type: # the following tokens can only come from default mode (after an LBRACE in f/t-string)
+            case self.NEWLINE:
+                # append the current brace expression with the current newline
+                self._append_to_brace_expression(self._cur_token.text)
+                self._cur_token.channel = Token.HIDDEN_CHANNEL
+            case self.LBRACE:
+                # the outermost brace expression cannot be a dictionary comprehension or a set comprehension
+                self._brace_expression_stack.append("{")
+                self._paren_or_bracket_opened_stack.append(0) # stack push
+                self._push_lexer_mode(Lexer.DEFAULT_MODE)
+            case self.LPAR | self.LSQB:
+                # append the current brace expression with a "(" or a "["
+                self._append_to_brace_expression(self._cur_token.text)
+                # https://peps.python.org/pep-0498/#lambdas-inside-expressions
+                self._increment_brace_stack()
+            case self.RPAR | self.RSQB:
+                # append the current brace expression with a ")" or a "]"
+                self._append_to_brace_expression(self._cur_token.text)
+                self._decrement_brace_stack()
+            case self.COLON | self.COLONEQUAL:
+                # append the current brace expression with a ":" or a ":="
+                self._append_to_brace_expression(self._cur_token.text)
+                self._set_lexer_mode_by_COLON_or_COLONEQUAL_token()
+            case self.RBRACE:
+                self._set_lexer_mode_after_RBRACE_token()
+            case _:
+                # append the current brace expression with the current token text
+                self._append_to_brace_expression(self._cur_token.text)
+
+    def _append_to_brace_expression(self, text: str) -> None:
+        # extend the innermost (top-of-stack) brace expression text
+        self._brace_expression_stack[-1] += text
+
+    def _increment_brace_stack(self) -> None: # increment the last element (stack peek + 1)
+        self._paren_or_bracket_opened_stack[-1] += 1
+
+    def _decrement_brace_stack(self) -> None: # decrement the last element (stack peek - 1)
+        self._paren_or_bracket_opened_stack[-1] -= 1
+
+    def _set_lexer_mode_after_RBRACE_token(self) -> None:
+        """On a closing '}': pops one mode from default mode, or two modes from a
+        format-specification mode; otherwise the '}' is unmatched and reported."""
+        match self._cur_lexer_mode:
+            case Lexer.DEFAULT_MODE:
+                self._pop_lexer_mode() # only once
+                self._pop_by_RBRACE()
+            case ( self.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE
+                 | self.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE):
+
+                # leave both the format-spec mode and the expression's default mode
+                self._pop_lexer_mode()
+                self._pop_lexer_mode()
+                self._pop_by_RBRACE()
+            case _:
+                self._report_lexer_error("f-string: single '}' is not allowed")
+
+    def _set_lexer_mode_by_ISTRING_START_token(self) -> None:
+        # ISTRING = interpolated string (FSTRING or TSTRING)
+        # The prefix->mode map is built lazily on first use.
+        if not PythonLexerBase._LEXER_MODES_FOR_ISTRING_START:
+            PythonLexerBase._init_lexer_modes_for_istring_start()
+
+        interpolated_string_prefix: str = self._cur_token.text.lower()
+        if interpolated_string_prefix in PythonLexerBase._LEXER_MODES_FOR_ISTRING_START:
+            new_lexer_mode: int = PythonLexerBase._LEXER_MODES_FOR_ISTRING_START[interpolated_string_prefix]
+            self._push_lexer_mode(new_lexer_mode)
+        else:
+            # an unknown prefix means the lexer grammar and this map are out of sync
+            self._report_lexer_error(
+                f"internal error: unknown interpolated string literal prefix: {self._cur_token.text}"
+            )
+
+    @staticmethod
+    def _init_lexer_modes_for_istring_start() -> None:
+        # Fills the static prefix->mode map. Key scheme: quote style (SQ/DQ),
+        # 1 = single-quoted, 3 = triple-quoted, R = raw.
+        # f-strings
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["f'"] = PythonLexer.PythonLexer.SQ1__FSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["rf'"] = PythonLexer.PythonLexer.SQ1R_FSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["fr'"] = PythonLexer.PythonLexer.SQ1R_FSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['f"'] = PythonLexer.PythonLexer.DQ1__FSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['rf"'] = PythonLexer.PythonLexer.DQ1R_FSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['fr"'] = PythonLexer.PythonLexer.DQ1R_FSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["f'''"] = PythonLexer.PythonLexer.SQ3__FSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["rf'''"] = PythonLexer.PythonLexer.SQ3R_FSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["fr'''"] = PythonLexer.PythonLexer.SQ3R_FSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['f"""'] = PythonLexer.PythonLexer.DQ3__FSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['rf"""'] = PythonLexer.PythonLexer.DQ3R_FSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['fr"""'] = PythonLexer.PythonLexer.DQ3R_FSTRING_MODE
+
+        # t-strings
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["t'"] = PythonLexer.PythonLexer.SQ1__TSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["rt'"] = PythonLexer.PythonLexer.SQ1R_TSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["tr'"] = PythonLexer.PythonLexer.SQ1R_TSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['t"'] = PythonLexer.PythonLexer.DQ1__TSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['rt"'] = PythonLexer.PythonLexer.DQ1R_TSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['tr"'] = PythonLexer.PythonLexer.DQ1R_TSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["t'''"] = PythonLexer.PythonLexer.SQ3__TSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["rt'''"] = PythonLexer.PythonLexer.SQ3R_TSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START["tr'''"] = PythonLexer.PythonLexer.SQ3R_TSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['t"""'] = PythonLexer.PythonLexer.DQ3__TSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['rt"""'] = PythonLexer.PythonLexer.DQ3R_TSTRING_MODE
+        PythonLexerBase._LEXER_MODES_FOR_ISTRING_START['tr"""'] = PythonLexer.PythonLexer.DQ3R_TSTRING_MODE
+
+    def _set_lexer_mode_by_COLON_or_COLONEQUAL_token(self) -> None:
+        """On ':' (or a split ':=') inside a replacement field, enters the matching
+        format-specification mode — unless an open '('/'[' means the colon is a slice/lambda colon."""
+        # Exit early when the current lexer mode indicates an open parenthesis/bracket
+        opened: bool = self._paren_or_bracket_opened_stack[-1] > 0 # stack peek
+        if opened:
+            return
+
+        # COLONEQUAL token will be replaced with a COLON token in _check_next_token()
+        prevLexerMode = self._lexer_mode_stack[-1] # stack peek  # NOTE(review): camelCase local, inconsistent with file style
+        match prevLexerMode:
+            case self.SQ1__FSTRING_MODE | self.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.SQ1__TSTRING_MODE | self.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.SQ1R_FSTRING_MODE | self.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.SQ1R_TSTRING_MODE | self.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.DQ1__FSTRING_MODE | self.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.DQ1__TSTRING_MODE | self.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.DQ1R_FSTRING_MODE | self.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.DQ1R_TSTRING_MODE | self.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.SQ3__FSTRING_MODE | self.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE)
+            case self.SQ3__TSTRING_MODE | self.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.SQ3R_FSTRING_MODE | self.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE)
+            case self.SQ3R_TSTRING_MODE | self.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.DQ3__FSTRING_MODE | self.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.DQ3__TSTRING_MODE | self.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE)
+            case self.DQ3R_FSTRING_MODE | self.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE)
+
+            case self.DQ3R_TSTRING_MODE | self.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+                self._push_lexer_mode(self.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE)
+
+    def _pop_by_RBRACE(self) -> None:
+        """Closes the innermost brace expression, keeping its full text in
+        _prev_brace_expression and folding it into the enclosing expression (if any)."""
+        self._paren_or_bracket_opened_stack.pop()
+        cur_brace_expression: str = self._brace_expression_stack.pop()
+        self._prev_brace_expression = cur_brace_expression + "}"
+        if self._brace_expression_stack:
+            # Extend the current brace expression by adding the previous expression
+            self._brace_expression_stack[-1] += self._prev_brace_expression
+
+    def _handle_ISTRING_MIDDLE_token_with_double_brace(self) -> None:
+        # ISTRING = interpolated string (FSTRING or TSTRING)
+        # '{{' / '}}' are escaped braces: split off the final brace as a hidden LBRACE/RBRACE token.
+        last_two_chars: str = self._get_last_two_chars_of_the_cur_token_text()
+        match last_two_chars:
+            case "{{":
+                self._trim_last_char_add_pending_token_set_cur_token(self.LBRACE, "{", Token.HIDDEN_CHANNEL)
+            case "}}":
+                self._trim_last_char_add_pending_token_set_cur_token(self.RBRACE, "}", Token.HIDDEN_CHANNEL)
+
+    def _handle_ISTRING_MIDDLE_token_with_quote_and_lbrace(self) -> None: # ISTRING = interpolated string (FSTRING or TSTRING)
+        # replace the trailing quote + left_brace with a quote and insert an LBRACE token
+        # replace the trailing backslash + left_brace with a backslash and insert an LBRACE token
+        last_two_chars: str = self._get_last_two_chars_of_the_cur_token_text()
+        match last_two_chars:
+            case "\"{" | "'{" | "\\{":
+                self._trim_last_char_add_pending_token_set_cur_token(self.LBRACE, "{", Token.DEFAULT_CHANNEL)
+
+    def _get_last_two_chars_of_the_cur_token_text(self) -> str:
+        # returns the whole text when it is shorter than two characters
+        text: str = self._cur_token.text
+        return text[-2:] if len(text) >= 2 else text
+
+    def _trim_last_char_add_pending_token_set_cur_token(self, type: int, text: str, channel: int) -> None:
+        """Removes the last character from _cur_token, queues the shortened token, then
+        replaces _cur_token with a new one-character token of the given type/text/channel."""
+        # trim the last char and add the modified curToken to the _pending_tokens stack
+        token_text_without_lbrace: str = self._cur_token.text[:-1]
+        self._cur_token.text = token_text_without_lbrace
+        self._cur_token.stop -= 1
+        self._add_pending_token(self._cur_token)
+
+        self._create_new_cur_token(type, text, channel) # set _cur_token
+
+    def _handle_COLONEQUAL_token_in_istring(self) -> None: # istring = interpolated string (FSTRING or TSTRING)
+        if self._lexer_mode_stack \
+           and self._paren_or_bracket_opened_stack[-1] == 0: # stack peek == 0
+
+            # In an f/t-string, the walrus operator (:=) is only allowed inside parentheses.
+            # If used outside, split the COLONEQUAL token into a COLON
+            # (used as a format specifier instead of a walrus operator),
+            # and move the equal sign to the beginning of the next token (FSTRING_MIDDLE or TSTRING_MIDDLE).
+            self._cur_token.type = self.COLON
+            self._cur_token.text = ":"
+            self._cur_token.stop = self._cur_token.start
+
+            match self._ffg_token.type:
+                case self.FSTRING_MIDDLE | self.TSTRING_MIDDLE:
+                    # prepend '=' to the existing MIDDLE token
+                    token: CommonToken = self._ffg_token.clone()
+                    token.text = "=" + token.text
+                    token.start -= 1
+                    token.column -= 1
+                    self._ffg_token = token
+                case _:
+                    # no MIDDLE token follows: queue the COLON and synthesize a one-char "=" MIDDLE token
+                    self._add_pending_token(self._cur_token)
+                    self._create_new_cur_token(self._cur_ISTRING_MIDDLE_token_type, "=", Token.DEFAULT_CHANNEL)
+        self._add_pending_token(self._cur_token)
+
+    def _create_new_cur_token(self, type: int, text: str, channel: int) -> None:
+        """Replaces _cur_token with a one-character token positioned right after it."""
+        token: CommonToken = self._cur_token.clone()
+        token.type = type
+        token.text = text
+        token.channel = channel
+        token.column += 1
+        token.start += 1
+        token.stop = token.start
+        self._cur_token = token
+
+    def _push_lexer_mode(self, mode: int) -> None:
+        # mirror pushMode() in our own stack (not all runtimes expose _mode/_modeStack)
+        self.pushMode(mode)
+        self._lexer_mode_stack.append(self._cur_lexer_mode) # stack push
+        self._cur_lexer_mode = mode
+
+    def _pop_lexer_mode(self) -> None:
+        # mirror popMode() in our own stack
+        self.popMode()
+        self._cur_lexer_mode = self._lexer_mode_stack.pop()
+
+    def _handle_FORMAT_SPECIFICATION_MODE(self) -> None:
+        """When the next token closes a replacement field, queues an empty
+        FSTRING_MIDDLE/TSTRING_MIDDLE standing in for a missing format specification."""
+        if not self._lexer_mode_stack or self._ffg_token.type != self.RBRACE:
+            return
+
+        # insert an empty FSTRING_MIDDLE or TSTRING_MIDDLE token instead of the missing format specification
+        match self._cur_token.type:
+            case self.COLON:
+                self._create_and_add_pending_token(self._cur_ISTRING_MIDDLE_token_type, "", self._ffg_token)
+            case self.RBRACE:
+                # only when the previous brace expression is not a dictionary comprehension or set comprehension
+                if not self._is_valid_dictionary_or_set_comprehension_expression(self._prev_brace_expression):
+                    self._create_and_add_pending_token(self._cur_ISTRING_MIDDLE_token_type, "", self._ffg_token)
+
+    def _is_valid_dictionary_or_set_comprehension_expression(self, code: str) -> bool:
+        """Re-lexes and re-parses `code` to decide whether it is a valid dict or set
+        comprehension. NOTE(review): spawns a fresh lexer/parser per call — presumably
+        acceptable because brace expressions are short; confirm for pathological inputs."""
+        # local imports avoid a circular import with the generated lexer/parser modules
+        from antlr4 import InputStream, CommonTokenStream
+        from PythonLexer import PythonLexer
+        from PythonParser import PythonParser
+
+        input_stream: InputStream = InputStream(code)
+        lexer: PythonLexer = PythonLexer(input_stream)
+        token_stream: CommonTokenStream = CommonTokenStream(lexer)
+        parser: PythonParser = PythonParser(token_stream)
+
+        # Disable error listeners to suppress console output
+        lexer.removeErrorListeners()
+        parser.removeErrorListeners()
+
+        parser.dictcomp() # Try parsing as dictionary comprehension
+        if parser.getNumberOfSyntaxErrors() == 0:
+            return True
+
+        # rewind the token stream and retry with a fresh parser
+        parser = PythonParser(token_stream)
+        token_stream.seek(0)
+        parser.removeErrorListeners()
+        parser.setcomp() # Try parsing as set comprehension
+        return parser.getNumberOfSyntaxErrors() == 0
+
+    def _insert_trailing_tokens(self) -> None:
+        """At EOF, queues a final NEWLINE (unless the last default-channel token already
+        ends the statement) and all DEDENT tokens still open on the indent stack."""
+        match self._last_pending_token_type_from_default_channel:
+            case self.NEWLINE | self.DEDENT:
+                pass # no trailing NEWLINE token is needed
+            case _: # insert an extra trailing NEWLINE token that serves as the end of the last statement
+                self._create_and_add_pending_token(self.NEWLINE, None, self._ffg_token) # _ffg_token is EOF
+        self._insert_INDENT_or_DEDENT_token(0) # Now insert as much trailing DEDENT tokens as needed
+
+    def _handle_EOF_token(self) -> None:
+        if self._last_pending_token_type_from_default_channel > 0:
+            # there was statement in the input (leading NEWLINE tokens are hidden)
+            self._insert_trailing_tokens()
+        self._add_pending_token(self._cur_token)
+
+    def _hide_and_add_pending_token(self, original_token: CommonToken) -> None:
+        # move the token to the hidden channel, then queue it
+        original_token.channel = Token.HIDDEN_CHANNEL
+        self._add_pending_token(original_token)
+
+ def _create_and_add_pending_token(self, ttype: int, text: Optional[str], original_token: CommonToken) -> None:
+ token: CommonToken = original_token.clone()
+ token.type = ttype
+ token.channel = Token.DEFAULT_CHANNEL
+ token.stop = original_token.start - 1
+ token.text = "<" + self.symbolicNames[ttype] + ">" if text is None else \
+ text
+
+ self._add_pending_token(token)
+
+    def _add_pending_token(self, token: CommonToken) -> None:
+        # save the last pending token type because the _pending_tokens list can be empty by the nextToken()
+        self._previous_pending_token_type = token.type
+        if token.channel == Token.DEFAULT_CHANNEL:
+            self._last_pending_token_type_from_default_channel = self._previous_pending_token_type
+        self._pending_tokens.append(token)
+
+ def _get_indentation_length(self, indentText: str) -> int: # the indentText may contain spaces, tabs or form feeds
+ length: int = 0
+ ch: str
+ for ch in indentText:
+ match ch:
+ case ' ':
+ self._was_space_indentation = True
+ length += 1
+ case '\t':
+ self._was_tab_indentation = True
+ length += PythonLexerBase.TAB_LENGTH - (length % PythonLexerBase.TAB_LENGTH)
+ case '\f': # form feed
+ length = 0
+
+ if self._was_tab_indentation and self._was_space_indentation:
+ if not self._was_indentation_mixed_with_spaces_and_tabs:
+ self._was_indentation_mixed_with_spaces_and_tabs = True
+ length = INVALID_LENGTH # only for the first inconsistent indent
+ return length
+
+    def _report_lexer_error(self, err_msg: str) -> None:
+        # report through the registered error listener(s) only; no token is produced
+        self.getErrorListenerDispatch().syntaxError(self, self._cur_token.type, self._cur_token.line, self._cur_token.column, " LEXER" + ERR_TXT + err_msg, None)
+
+    def _report_error(self, err_msg: str) -> None:
+        """Reports the error AND queues an ERRORTOKEN so the parser also fails."""
+        self._report_lexer_error(err_msg)
+
+        self._create_and_add_pending_token(self.ERRORTOKEN, ERR_TXT + err_msg, self._ffg_token)
+        # the ERRORTOKEN also triggers a parser error
diff --git a/python/python3_13/Python3_13_2_official_grammar.peg b/python/python3_14/Python3_14_2_official_grammar.peg
similarity index 94%
rename from python/python3_13/Python3_13_2_official_grammar.peg
rename to python/python3_14/Python3_14_2_official_grammar.peg
index e774b1e92b..27990f68f2 100644
--- a/python/python3_13/Python3_13_2_official_grammar.peg
+++ b/python/python3_14/Python3_14_2_official_grammar.peg
@@ -56,7 +56,7 @@
# ~
# Commit to the current alternative, even if it fails to parse.
# &&e
-# Eager parse e. The parser will not backtrack and will immediately
+# Eager parse e. The parser will not backtrack and will immediately
# fail with SyntaxError if e cannot be parsed.
#
@@ -73,10 +73,15 @@ func_type: '(' [type_expressions] ')' '->' expression NEWLINE* ENDMARKER
statements: statement+
-statement: compound_stmt | simple_stmts
+statement:
+ | compound_stmt
+ | simple_stmts
+
+single_compound_stmt:
+ | compound_stmt
statement_newline:
- | compound_stmt NEWLINE
+ | single_compound_stmt NEWLINE
| simple_stmts
| NEWLINE
| ENDMARKER
@@ -94,12 +99,12 @@ simple_stmt:
| return_stmt
| import_stmt
| raise_stmt
- | 'pass'
+ | pass_stmt
| del_stmt
| yield_stmt
| assert_stmt
- | 'break'
- | 'continue'
+ | break_stmt
+ | continue_stmt
| global_stmt
| nonlocal_stmt
@@ -121,8 +126,8 @@ assignment:
| NAME ':' expression ['=' annotated_rhs ]
| ('(' single_target ')'
| single_subscript_attribute_target) ':' expression ['=' annotated_rhs ]
- | (star_targets '=' )+ (yield_expr | star_expressions) !'=' [TYPE_COMMENT]
- | single_target augassign ~ (yield_expr | star_expressions)
+ | (star_targets '=' )+ annotated_rhs !'=' [TYPE_COMMENT]
+ | single_target augassign ~ annotated_rhs
annotated_rhs: yield_expr | star_expressions
@@ -148,6 +153,15 @@ raise_stmt:
| 'raise' expression ['from' expression ]
| 'raise'
+pass_stmt:
+ | 'pass'
+
+break_stmt:
+ | 'break'
+
+continue_stmt:
+ | 'continue'
+
global_stmt: 'global' ','.NAME+
nonlocal_stmt: 'nonlocal' ','.NAME+
@@ -179,10 +193,12 @@ import_from_as_names:
| ','.import_from_as_name+
import_from_as_name:
| NAME ['as' NAME ]
+
dotted_as_names:
| ','.dotted_as_name+
dotted_as_name:
| dotted_name ['as' NAME ]
+
dotted_name:
| dotted_name '.' NAME
| NAME
@@ -334,10 +350,14 @@ try_stmt:
# ----------------
except_block:
- | 'except' expression ['as' NAME ] ':' block
+ | 'except' expression ':' block
+ | 'except' expression 'as' NAME ':' block
+ | 'except' expressions ':' block
| 'except' ':' block
except_star_block:
- | 'except' '*' expression ['as' NAME ] ':' block
+ | 'except' '*' expression ':' block
+ | 'except' '*' expression 'as' NAME ':' block
+ | 'except' '*' expressions ':' block
finally_block:
| 'finally' ':' block
@@ -495,8 +515,7 @@ type_alias:
# Type parameter declaration
# --------------------------
-type_params:
- | invalid_type_params
+type_params:
| '[' type_param_seq ']'
type_param_seq: ','.type_param+ [',']
@@ -742,8 +761,25 @@ fstring_format_spec:
fstring:
| FSTRING_START fstring_middle* FSTRING_END
+tstring_format_spec_replacement_field:
+ | '{' annotated_rhs '='? [fstring_conversion] [tstring_full_format_spec] '}'
+tstring_format_spec:
+ | TSTRING_MIDDLE
+ | tstring_format_spec_replacement_field
+tstring_full_format_spec:
+ | ':' tstring_format_spec*
+tstring_replacement_field:
+ | '{' annotated_rhs '='? [fstring_conversion] [tstring_full_format_spec] '}'
+tstring_middle:
+ | tstring_replacement_field
+ | TSTRING_MIDDLE
+tstring:
+ | TSTRING_START tstring_middle* TSTRING_END
+
string: STRING
-strings: (fstring|string)+
+strings:
+ | (fstring|string)+
+ | tstring+
list:
| '[' [star_named_expressions] ']'
diff --git a/python/python3_13/PythonLexer.g4 b/python/python3_14/PythonLexer.g4
similarity index 61%
rename from python/python3_13/PythonLexer.g4
rename to python/python3_14/PythonLexer.g4
index 98b99d4aef..a6ed067c3c 100644
--- a/python/python3_13/PythonLexer.g4
+++ b/python/python3_14/PythonLexer.g4
@@ -21,35 +21,41 @@ THE SOFTWARE.
*/
/*
- * Project : an ANTLR4 lexer grammar for Python 3
- * https://github.com/RobEin/ANTLR4-parser-for-Python-3.13
+ * Project : an ANTLR4 lexer grammar for Python 3 programming language
+ * https://github.com/RobEin/ANTLR4-parser-for-Python-3.14
* Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com
*/
-// https://docs.python.org/3.13/reference/lexical_analysis.html
-
+// https://docs.python.org/3.14/reference/lexical_analysis.html
lexer grammar PythonLexer;
+// the helper class for this grammar that assists in tokenizing indentation, interpolated strings, and the encoding declaration
options { superClass=PythonLexerBase; }
tokens {
- ENCODING // https://docs.python.org/3.13/reference/lexical_analysis.html#encoding-declarations
- , INDENT, DEDENT // https://docs.python.org/3.13/reference/lexical_analysis.html#indentation
- , TYPE_COMMENT // not supported, only for compatibility with the PythonParser.g4 grammar
+ ENCODING // https://docs.python.org/3.14/reference/lexical_analysis.html#encoding-declarations
+ , INDENT, DEDENT // https://docs.python.org/3.14/reference/lexical_analysis.html#indentation
+ , TYPE_COMMENT // not supported, only for compatibility with the parser grammar
, FSTRING_START, FSTRING_MIDDLE, FSTRING_END // https://peps.python.org/pep-0701/#specification
+ , TSTRING_START, TSTRING_MIDDLE, TSTRING_END // https://peps.python.org/pep-0750/#specification
}
/*
* default lexer mode
*/
-// https://docs.python.org/3.13/library/token.html#module-token
-LPAR : '('; // OPEN_PAREN
-LSQB : '['; // OPEN_BRACK
-LBRACE : '{'; // OPEN_BRACE
-RPAR : ')'; // CLOSE_PAREN
-RSQB : ']'; // CLOSE_BRACK
-RBRACE : '}'; // CLOSE_BRACE
+// https://docs.python.org/3.14/reference/lexical_analysis.html#encoding-declarations
+BOM : '\uFEFF';
+// The BOM unicode character indicates that a BOM byte sequence (for Python this is always UTF-8: EF BB BF) was present at the start of the file.
+// It is not part of Python source code and is therefore skipped in PythonLexerBase.
+
+// https://docs.python.org/3.14/library/token.html#module-token
+LPAR : '(';
+LSQB : '[';
+LBRACE : '{';
+RPAR : ')';
+RSQB : ']';
+RBRACE : '}';
DOT : '.';
COLON : ':';
COMMA : ',';
@@ -93,7 +99,7 @@ ELLIPSIS : '...';
COLONEQUAL : ':=';
EXCLAMATION : '!';
-// https://docs.python.org/3.13/reference/lexical_analysis.html#keywords
+// https://docs.python.org/3.14/reference/lexical_analysis.html#keywords
FALSE : 'False';
AWAIT : 'await';
ELSE : 'else';
@@ -130,157 +136,296 @@ IF : 'if';
OR : 'or';
YIELD : 'yield';
-// *** Soft Keywords: https://docs.python.org/3.13/reference/lexical_analysis.html#soft-keywords
-NAME_OR_TYPE : 'type'; // identifier or type keyword, the parser grammar will decide what it means
-NAME_OR_MATCH : 'match'; // identifier or match keyword, the parser grammar will decide what it means
-NAME_OR_CASE : 'case'; // identifier or case keyword, the parser grammar will decide what it means
-NAME_OR_WILDCARD : '_'; // identifier or wildcard symbol, the parser grammar will decide what it means
+// *** Soft Keywords: https://docs.python.org/3.14/reference/lexical_analysis.html#soft-keywords
+ // the parser grammar determines whether it is an ...
+NAME_OR_TYPE : 'type'; // ... identifier or a type keyword, depending on the source code context
+NAME_OR_MATCH : 'match'; // ... identifier or a match keyword, depending on the source code context
+NAME_OR_CASE : 'case'; // ... identifier or a case keyword, depending on the source code context
+NAME_OR_WILDCARD : '_'; // ... identifier or a wildcard symbol, depending on the source code context
-// https://docs.python.org/3.13/reference/lexical_analysis.html#identifiers
+// https://docs.python.org/3.14/reference/lexical_analysis.html#identifiers
NAME : ID_START ID_CONTINUE*;
-// https://docs.python.org/3.13/reference/lexical_analysis.html#numeric-literals
+// https://docs.python.org/3.14/reference/lexical_analysis.html#numeric-literals
NUMBER
: INTEGER
| FLOAT_NUMBER
| IMAG_NUMBER
;
-// https://docs.python.org/3.13/reference/lexical_analysis.html#string-and-bytes-literals
+// https://docs.python.org/3.14/reference/lexical_analysis.html#string-and-bytes-literals
STRING
: STRING_LITERAL
| BYTES_LITERAL
;
-// https://docs.python.org/3.13/reference/lexical_analysis.html#physical-lines
+// https://docs.python.org/3.14/reference/lexical_analysis.html#physical-lines
NEWLINE : '\r'? '\n'; // Unix, Windows
-// https://docs.python.org/3.13/reference/lexical_analysis.html#comments
+// https://docs.python.org/3.14/reference/lexical_analysis.html#comments
COMMENT : '#' ~[\r\n]* -> channel(HIDDEN);
-// https://docs.python.org/3.13/reference/lexical_analysis.html#whitespace-between-tokens
+// https://docs.python.org/3.14/reference/lexical_analysis.html#whitespace-between-tokens
WS : [ \t\f]+ -> channel(HIDDEN);
-// https://docs.python.org/3.13/reference/lexical_analysis.html#explicit-line-joining
+// https://docs.python.org/3.14/reference/lexical_analysis.html#explicit-line-joining
EXPLICIT_LINE_JOINING : BACKSLASH_NEWLINE -> channel(HIDDEN);
+// https://docs.python.org/3.14/reference/lexical_analysis.html#formatted-string-literals
+FSTRING_START : FSTRING_PREFIX STRING_QUOTES; // pushMode(...._FSTRING_MODE) is called in PythonLexerBase
+TSTRING_START : TSTRING_PREFIX STRING_QUOTES; // pushMode(...._TSTRING_MODE) is called in PythonLexerBase
-// *************************
-// abbreviations for FSTRING
-// *************************
-// SQ1__FSTRING = short single quoted formatted string: f'abc'
-// DQ1__FSTRING = short double quoted formatted string: f"abc"
-// SQ1R_FSTRING = short single quoted raw formatted string: rf'abc'
-// DQ1R_FSTRING = short double quoted raw formatted string: rf"abc"
-//
-// SQ3__FSTRING = long single quoted formatted string: f'''abc'''
-// DQ3__FSTRING = long double quoted formatted string: f"""abc"""
-// SQ3R_FSTRING = long single quoted raw formatted string: rf'''abc'''
-// DQ3R_FSTRING = long double quoted raw formatted string: rf"""abc"""
-
-// https://docs.python.org/3.13/reference/lexical_analysis.html#formatted-string-literals
-FSTRING_START : FSTRING_PREFIX ([']
- | ["]
- | [']['][']
- | ["]["]["])
- ; // pushMode(????_FSTRING_MODE) will be called in PythonLexerBase class
-
-// catch the unrecognized characters
-ERRORTOKEN : . ; // PythonLexerBase class will report an error about this (the ERRORTOKEN will also cause an error in the parser)
-
+// catch unrecognized characters
+ERRORTOKEN : . ; // the PythonLexerBase class reports a lexer error for them (ERRORTOKEN also triggers a parser error)
/*
- * other lexer modes
+ * lexer modes for interpolation string literals
*/
+// **********************************************************************
+// Abbreviations for interpolation string literals (f-strings, t-strings)
+// **********************************************************************
+// SQ1__ISTRING = short single quoted interpolation string, e.g.: f'Hello {name}'
+// DQ1__ISTRING = short double quoted interpolation string, e.g.: f"Hello {name}"
+// SQ1R_ISTRING = short single quoted raw interpolation string, e.g.: rf'Hello {name}'
+// DQ1R_ISTRING = short double quoted raw interpolation string, e.g.: rf"Hello {name}"
+//
+// SQ3__ISTRING = long single quoted interpolation string, e.g.: f'''Hello {name}'''
+// DQ3__ISTRING = long double quoted interpolation string, e.g.: f"""Hello {name}"""
+// SQ3R_ISTRING = long single quoted raw interpolation string, e.g.: rf'''Hello {name}'''
+// DQ3R_ISTRING = long double quoted raw interpolation string, e.g.: rf"""Hello {name}"""
+
mode SQ1__FSTRING_MODE;
- SQ1__FSTRING_END : ['] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class
- SQ1__FSTRING_MIDDLE : SQ1__FSTRING_ITEM -> type(FSTRING_MIDDLE);
- SQ1__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1__FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class
+ SQ1__FSTRING_END : ['] -> type(FSTRING_END); // popMode() is called in PythonLexerBase
+ SQ1__FSTRING_MIDDLE : SQ1__ISTRING_ITEM -> type(FSTRING_MIDDLE);
+ SQ1__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1__FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ SQ1__FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode SQ1__TSTRING_MODE;
+ SQ1__TSTRING_END : ['] -> type(TSTRING_END); // popMode() is called in PythonLexerBase
+ SQ1__TSTRING_MIDDLE : SQ1__ISTRING_ITEM -> type(TSTRING_MIDDLE);
+ SQ1__TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1__TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ SQ1__TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
mode SQ1R_FSTRING_MODE;
- SQ1R_FSTRING_END : ['] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class
- SQ1R_FSTRING_MIDDLE : SQ1R_FSTRING_ITEM -> type(FSTRING_MIDDLE);
- SQ1R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1R_FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class
+ SQ1R_FSTRING_END : ['] -> type(FSTRING_END); // popMode() is called in PythonLexerBase
+ SQ1R_FSTRING_MIDDLE : SQ1R_ISTRING_ITEM -> type(FSTRING_MIDDLE);
+ SQ1R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ SQ1R_FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode SQ1R_TSTRING_MODE;
+ SQ1R_TSTRING_END : ['] -> type(TSTRING_END); // popMode() is called in PythonLexerBase
+ SQ1R_TSTRING_MIDDLE : SQ1R_ISTRING_ITEM -> type(TSTRING_MIDDLE);
+ SQ1R_TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ SQ1R_TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
mode DQ1__FSTRING_MODE;
- DQ1__FSTRING_END : ["] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class
- DQ1__FSTRING_MIDDLE : DQ1__FSTRING_ITEM -> type(FSTRING_MIDDLE);
- DQ1__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1__FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class
+ DQ1__FSTRING_END : ["] -> type(FSTRING_END); // popMode() is called in PythonLexerBase
+ DQ1__FSTRING_MIDDLE : DQ1__ISTRING_ITEM -> type(FSTRING_MIDDLE);
+ DQ1__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1__FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ DQ1__FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode DQ1__TSTRING_MODE;
+ DQ1__TSTRING_END : ["] -> type(TSTRING_END); // popMode() is called in PythonLexerBase
+ DQ1__TSTRING_MIDDLE : DQ1__ISTRING_ITEM -> type(TSTRING_MIDDLE);
+ DQ1__TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1__TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ DQ1__TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
mode DQ1R_FSTRING_MODE;
- DQ1R_FSTRING_END : ["] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class
- DQ1R_FSTRING_MIDDLE : DQ1R_FSTRING_ITEM -> type(FSTRING_MIDDLE);
- DQ1R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1R_FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class
+ DQ1R_FSTRING_END : ["] -> type(FSTRING_END); // popMode() is called in PythonLexerBase
+ DQ1R_FSTRING_MIDDLE : DQ1R_ISTRING_ITEM -> type(FSTRING_MIDDLE);
+ DQ1R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ DQ1R_FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode DQ1R_TSTRING_MODE;
+ DQ1R_TSTRING_END : ["] -> type(TSTRING_END); // popMode() is called in PythonLexerBase
+ DQ1R_TSTRING_MIDDLE : DQ1R_ISTRING_ITEM -> type(TSTRING_MIDDLE);
+ DQ1R_TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ DQ1R_TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
mode SQ3__FSTRING_MODE;
- SQ3__FSTRING_END : ['][']['] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class
- SQ3__FSTRING_MIDDLE : SQ3__FSTRING_ITEM -> type(FSTRING_MIDDLE);
- SQ3__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3__FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class
+ SQ3__FSTRING_END : ['][']['] -> type(FSTRING_END); // popMode() is called in PythonLexerBase
+ SQ3__FSTRING_MIDDLE : SQ3__ISTRING_ITEM -> type(FSTRING_MIDDLE);
+ SQ3__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3__FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ SQ3__FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode SQ3__TSTRING_MODE;
+ SQ3__TSTRING_END : ['][']['] -> type(TSTRING_END); // popMode() is called in PythonLexerBase
+ SQ3__TSTRING_MIDDLE : SQ3__ISTRING_ITEM -> type(TSTRING_MIDDLE);
+ SQ3__TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3__TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ SQ3__TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
mode SQ3R_FSTRING_MODE;
- SQ3R_FSTRING_END : ['][']['] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class
- SQ3R_FSTRING_MIDDLE : SQ3R_FSTRING_ITEM -> type(FSTRING_MIDDLE);
- SQ3R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3R_FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class
+ SQ3R_FSTRING_END : ['][']['] -> type(FSTRING_END); // popMode() is called in PythonLexerBase
+ SQ3R_FSTRING_MIDDLE : SQ3R_ISTRING_ITEM -> type(FSTRING_MIDDLE);
+ SQ3R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ SQ3R_FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode SQ3R_TSTRING_MODE;
+ SQ3R_TSTRING_END : ['][']['] -> type(TSTRING_END); // popMode() is called in PythonLexerBase
+ SQ3R_TSTRING_MIDDLE : SQ3R_ISTRING_ITEM -> type(TSTRING_MIDDLE);
+ SQ3R_TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ SQ3R_TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
mode DQ3__FSTRING_MODE;
- DQ3__FSTRING_END : ["]["]["] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class
- DQ3__FSTRING_MIDDLE : DQ3__FSTRING_ITEM -> type(FSTRING_MIDDLE);
- DQ3__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3__FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class
+ DQ3__FSTRING_END : ["]["]["] -> type(FSTRING_END); // popMode() is called in PythonLexerBase
+ DQ3__FSTRING_MIDDLE : DQ3__ISTRING_ITEM -> type(FSTRING_MIDDLE);
+ DQ3__FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3__FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ DQ3__FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode DQ3__TSTRING_MODE;
+ DQ3__TSTRING_END : ["]["]["] -> type(TSTRING_END); // popMode() is called in PythonLexerBase
+ DQ3__TSTRING_MIDDLE : DQ3__ISTRING_ITEM -> type(TSTRING_MIDDLE);
+ DQ3__TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3__TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ DQ3__TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
mode DQ3R_FSTRING_MODE;
- DQ3R_FSTRING_END : ["]["]["] -> type(FSTRING_END); // popMode will be called in PythonLexerBase class
- DQ3R_FSTRING_MIDDLE : DQ3R_FSTRING_ITEM -> type(FSTRING_MIDDLE);
- DQ3R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3R_FORMAT_SPECIFICATION_MODE) will be called in PythonLexerBase class
-
-
-mode SQ1__FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon
- SQ1__FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ1__FSTRING_PART+ -> type(FSTRING_MIDDLE);
- SQ1__FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class
- SQ1__FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to SQ1__FSTRING_MODE by PythonLexerBase class
-
-mode SQ1R_FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon
- SQ1R_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ1R_FSTRING_PART+ -> type(FSTRING_MIDDLE);
- SQ1R_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class
- SQ1R_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to SQ1R_FSTRING_MODEby PythonLexerBase class
-
-mode DQ1__FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon
- DQ1__FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ1__FSTRING_PART+ -> type(FSTRING_MIDDLE);
- DQ1__FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class
- DQ1__FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to DQ1__FSTRING_MODE by PythonLexerBase class
-
-mode DQ1R_FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon
- DQ1R_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ1R_FSTRING_PART+ -> type(FSTRING_MIDDLE);
- DQ1R_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class
- DQ1R_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to DQ1R_FSTRING_MODE by PythonLexerBase class
-
-mode SQ3__FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon
- SQ3__FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ3__FSTRING_PART+ -> type(FSTRING_MIDDLE);
- SQ3__FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class
- SQ3__FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to SQ3__FSTRING_MODE by PythonLexerBase class
-
-mode SQ3R_FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon
- SQ3R_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ3R_FSTRING_PART+ -> type(FSTRING_MIDDLE);
- SQ3R_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class
- SQ3R_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to SQ3R_FSTRING_MODE by PythonLexerBase class
-
-mode DQ3__FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon
- DQ3__FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ3__FSTRING_PART+ -> type(FSTRING_MIDDLE);
- DQ3__FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class
- DQ3__FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to DQ3__FSTRING_MODE by PythonLexerBase class
-
-mode DQ3R_FORMAT_SPECIFICATION_MODE; // it is only used after a format specifier colon
- DQ3R_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ3R_FSTRING_PART+ -> type(FSTRING_MIDDLE);
- DQ3R_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // will be closed in DEFAULT_MODE by PythonLexerBase class
- DQ3R_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode to DQ3R_FSTRING_MODE by PythonLexerBase class
+ DQ3R_FSTRING_END : ["]["]["] -> type(FSTRING_END); // popMode() is called in PythonLexerBase
+ DQ3R_FSTRING_MIDDLE : DQ3R_ISTRING_ITEM -> type(FSTRING_MIDDLE);
+ DQ3R_FSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ DQ3R_FSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode DQ3R_TSTRING_MODE;
+ DQ3R_TSTRING_END : ["]["]["] -> type(TSTRING_END); // popMode() is called in PythonLexerBase
+ DQ3R_TSTRING_MIDDLE : DQ3R_ISTRING_ITEM -> type(TSTRING_MIDDLE);
+ DQ3R_TSTRING_LBRACE : '{' -> type(LBRACE); // pushMode(DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE) is called in PythonLexerBase
+ DQ3R_TSTRING_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
+
+// *** format specification modes for interpolated strings ***
+mode SQ1__FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ SQ1__FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ1__ISTRING_PART+ -> type(FSTRING_MIDDLE);
+ SQ1__FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ SQ1__FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ1__FSTRING_MODE is called in PythonLexerBase
+ SQ1__FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode SQ1__TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ SQ1__TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : SQ1__ISTRING_PART+ -> type(TSTRING_MIDDLE);
+ SQ1__TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ SQ1__TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ1__TSTRING_MODE is called in PythonLexerBase
+ SQ1__TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
+
+mode SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ SQ1R_FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ1R_ISTRING_PART+ -> type(FSTRING_MIDDLE);
+ SQ1R_FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ SQ1R_FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ1R_FSTRING_MODE is called in PythonLexerBase
+ SQ1R_FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ SQ1R_TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : SQ1R_ISTRING_PART+ -> type(TSTRING_MIDDLE);
+ SQ1R_TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ SQ1R_TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ1R_TSTRING_MODE is called in PythonLexerBase
+ SQ1R_TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
+
+mode DQ1__FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ DQ1__FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ1__ISTRING_PART+ -> type(FSTRING_MIDDLE);
+ DQ1__FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ DQ1__FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ1__FSTRING_MODE is called in PythonLexerBase
+ DQ1__FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode DQ1__TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ DQ1__TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : DQ1__ISTRING_PART+ -> type(TSTRING_MIDDLE);
+ DQ1__TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ DQ1__TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ1__TSTRING_MODE is called in PythonLexerBase
+ DQ1__TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
+
+mode DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ DQ1R_FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ1R_ISTRING_PART+ -> type(FSTRING_MIDDLE);
+ DQ1R_FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ DQ1R_FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ1R_FSTRING_MODE is called in PythonLexerBase
+ DQ1R_FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ DQ1R_TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : DQ1R_ISTRING_PART+ -> type(TSTRING_MIDDLE);
+ DQ1R_TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ DQ1R_TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ1R_TSTRING_MODE is called in PythonLexerBase
+ DQ1R_TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
+
+mode SQ3__FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ SQ3__FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ3__ISTRING_PART+ -> type(FSTRING_MIDDLE);
+ SQ3__FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ SQ3__FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ3__FSTRING_MODE is called in PythonLexerBase
+ SQ3__FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode SQ3__TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ SQ3__TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : SQ3__ISTRING_PART+ -> type(TSTRING_MIDDLE);
+ SQ3__TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ SQ3__TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ3__TSTRING_MODE is called in PythonLexerBase
+ SQ3__TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
+
+mode SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ SQ3R_FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : SQ3R_ISTRING_PART+ -> type(FSTRING_MIDDLE);
+ SQ3R_FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ SQ3R_FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ3R_FSTRING_MODE is called in PythonLexerBase
+ SQ3R_FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ SQ3R_TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : SQ3R_ISTRING_PART+ -> type(TSTRING_MIDDLE);
+ SQ3R_TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ SQ3R_TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to SQ3R_TSTRING_MODE is called in PythonLexerBase
+ SQ3R_TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
+
+mode DQ3__FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ DQ3__FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ3__ISTRING_PART+ -> type(FSTRING_MIDDLE);
+ DQ3__FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ DQ3__FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ3__FSTRING_MODE is called in PythonLexerBase
+ DQ3__FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode DQ3__TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ DQ3__TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : DQ3__ISTRING_PART+ -> type(TSTRING_MIDDLE);
+ DQ3__TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ DQ3__TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ3__TSTRING_MODE is called in PythonLexerBase
+ DQ3__TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+
+
+mode DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ DQ3R_FSTRING_FORMAT_SPECIFICATION_FSTRING_MIDDLE : DQ3R_ISTRING_PART+ -> type(FSTRING_MIDDLE);
+ DQ3R_FSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ DQ3R_FSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ3R_FSTRING_MODE is called in PythonLexerBase
+ DQ3R_FSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
+mode DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE; // called from the PythonLexerBase class; only used after a format specifier colon
+ DQ3R_TSTRING_FORMAT_SPECIFICATION_TSTRING_MIDDLE : DQ3R_ISTRING_PART+ -> type(TSTRING_MIDDLE);
+ DQ3R_TSTRING_FORMAT_SPECIFICATION_LBRACE : '{' -> type(LBRACE); // closed in DEFAULT_MODE by the PythonLexerBase class
+ DQ3R_TSTRING_FORMAT_SPECIFICATION_RBRACE : '}' -> type(RBRACE); // popMode() to DQ3R_TSTRING_MODE is called in PythonLexerBase
+ DQ3R_TSTRING_FORMAT_SPECIFICATION_ERRORTOKEN : . -> type(ERRORTOKEN);
+
/*
* fragments
*/
-// https://docs.python.org/3.13/reference/lexical_analysis.html#literals
+// https://docs.python.org/3.14/reference/lexical_analysis.html#literals
//
-// https://docs.python.org/3.13/reference/lexical_analysis.html#string-and-bytes-literals
+// https://docs.python.org/3.14/reference/lexical_analysis.html#string-and-bytes-literals
fragment STRING_LITERAL : STRING_PREFIX? (SHORT_STRING | LONG_STRING);
fragment STRING_PREFIX options { caseInsensitive=true; } : 'r' | 'u'; // 'r' | 'u' | 'R' | 'U'
@@ -306,7 +451,7 @@ fragment LONG__STRING_CHAR : ~[\\]; //
-// https://docs.python.org/3.13/reference/lexical_analysis.html#string-and-bytes-literals
+// https://docs.python.org/3.14/reference/lexical_analysis.html#string-and-bytes-literals
fragment BYTES_LITERAL : BYTES_PREFIX (SHORT_BYTES | LONG_BYTES);
fragment BYTES_PREFIX options { caseInsensitive=true; } : 'b' | 'br' | 'rb'; // 'b' | 'B' | 'br' | 'Br' | 'bR' | 'BR' | 'rb' | 'rB' | 'Rb' | 'RB'
@@ -344,57 +489,69 @@ fragment SHORT_DOUBLE_QUOTED_BYTES_CHAR //
fragment BYTES_ESCAPE_SEQ : '\\' [\u0000-\u007F]; // "\"
-// https://docs.python.org/3.13/reference/lexical_analysis.html#formatted-string-literals
-// https://docs.python.org/3.13/library/string.html#format-specification-mini-language
-// 'f' | 'F' | 'fr' | 'Fr' | 'fR' | 'FR' | 'rf' | 'rF' | 'Rf' | 'RF'
-fragment FSTRING_PREFIX options { caseInsensitive=true; } : 'f' | 'fr' | 'rf';
+// https://docs.python.org/3.14/reference/lexical_analysis.html#formatted-string-literals
+fragment FSTRING_PREFIX options { caseInsensitive=true; } : 'f' | 'fr' | 'rf'; // 'f' | 'F' | 'fr' | 'Fr' | 'fR' | 'FR' | 'rf' | 'rF' | 'Rf' | 'RF'
+fragment TSTRING_PREFIX options { caseInsensitive=true; } : 't' | 'tr' | 'rt'; // 't' | 'T' | 'tr' | 'Tr' | 'tR' | 'TR' | 'rt' | 'rT' | 'Rt' | 'RT'
+fragment STRING_QUOTES : [']
+ | ["]
+ | [']['][']
+ | ["]["]["]
+ ;
-fragment SQ1__FSTRING_ITEM : (SQ1__FSTRING_PART+ TERMINATING_FSTRING_MIDDLE?) | TERMINATING_FSTRING_MIDDLE;
-fragment DQ1__FSTRING_ITEM : (DQ1__FSTRING_PART+ TERMINATING_FSTRING_MIDDLE?) | TERMINATING_FSTRING_MIDDLE;
-fragment SQ3__FSTRING_ITEM : (SQ3__FSTRING_PART+ TERMINATING_SQ3__FSTRING_MIDDLE?) | TERMINATING_SQ3__FSTRING_MIDDLE;
-fragment DQ3__FSTRING_ITEM : (DQ3__FSTRING_PART+ TERMINATING_DQ3__FSTRING_MIDDLE?) | TERMINATING_DQ3__FSTRING_MIDDLE;
+fragment SQ1__ISTRING_ITEM : (SQ1__ISTRING_PART+ TERMINATING_ISTRING_MIDDLE?) | TERMINATING_ISTRING_MIDDLE;
+fragment DQ1__ISTRING_ITEM : (DQ1__ISTRING_PART+ TERMINATING_ISTRING_MIDDLE?) | TERMINATING_ISTRING_MIDDLE;
+fragment SQ3__ISTRING_ITEM : (SQ3__ISTRING_PART+ TERMINATING_SQ3__ISTRING_MIDDLE?) | TERMINATING_SQ3__ISTRING_MIDDLE;
+fragment DQ3__ISTRING_ITEM : (DQ3__ISTRING_PART+ TERMINATING_DQ3__ISTRING_MIDDLE?) | TERMINATING_DQ3__ISTRING_MIDDLE;
-fragment SQ1R_FSTRING_ITEM : (SQ1R_FSTRING_PART+ TERMINATING_FSTRING_MIDDLE_RAW?) | TERMINATING_FSTRING_MIDDLE_RAW;
-fragment DQ1R_FSTRING_ITEM : (DQ1R_FSTRING_PART+ TERMINATING_FSTRING_MIDDLE_RAW?) | TERMINATING_FSTRING_MIDDLE_RAW;
-fragment SQ3R_FSTRING_ITEM : (SQ3R_FSTRING_PART+ TERMINATING_SQ3R_FSTRING_MIDDLE?) | TERMINATING_SQ3R_FSTRING_MIDDLE;
-fragment DQ3R_FSTRING_ITEM : (DQ3R_FSTRING_PART+ TERMINATING_DQ3R_FSTRING_MIDDLE?) | TERMINATING_DQ3R_FSTRING_MIDDLE;
+fragment SQ1R_ISTRING_ITEM : (SQ1R_ISTRING_PART+ TERMINATING_ISTRING_MIDDLE_RAW?) | TERMINATING_ISTRING_MIDDLE_RAW;
+fragment DQ1R_ISTRING_ITEM : (DQ1R_ISTRING_PART+ TERMINATING_ISTRING_MIDDLE_RAW?) | TERMINATING_ISTRING_MIDDLE_RAW;
+fragment SQ3R_ISTRING_ITEM : (SQ3R_ISTRING_PART+ TERMINATING_SQ3R_ISTRING_MIDDLE?) | TERMINATING_SQ3R_ISTRING_MIDDLE;
+fragment DQ3R_ISTRING_ITEM : (DQ3R_ISTRING_PART+ TERMINATING_DQ3R_ISTRING_MIDDLE?) | TERMINATING_DQ3R_ISTRING_MIDDLE;
-fragment SQ1__FSTRING_PART : SQ1_FSTRING_CHAR | FSTRING_ESCAPE_SEQ;
-fragment DQ1__FSTRING_PART : DQ1_FSTRING_CHAR | FSTRING_ESCAPE_SEQ;
-fragment SQ3__FSTRING_PART : ONE_OR_TWO_SQUOTE? (SQ3_FSTRING_CHAR | FSTRING_ESCAPE_SEQ);
-fragment DQ3__FSTRING_PART : ONE_OR_TWO_DQUOTE? (DQ3_FSTRING_CHAR | FSTRING_ESCAPE_SEQ);
-fragment SQ1R_FSTRING_PART : SQ1_FSTRING_CHAR | FSTRING_ESCAPE_SEQ_RAW;
-fragment DQ1R_FSTRING_PART : DQ1_FSTRING_CHAR | FSTRING_ESCAPE_SEQ_RAW;
-fragment SQ3R_FSTRING_PART : ONE_OR_TWO_SQUOTE? (SQ3_FSTRING_CHAR | FSTRING_ESCAPE_SEQ_RAW);
-fragment DQ3R_FSTRING_PART : ONE_OR_TWO_DQUOTE? (DQ3_FSTRING_CHAR | FSTRING_ESCAPE_SEQ_RAW);
-fragment SQ1_FSTRING_CHAR : ~[\\{}'\r\n]; //
-fragment DQ1_FSTRING_CHAR : ~[\\{}"\r\n]; //
-fragment SQ3_FSTRING_CHAR : ~[\\{}']; //
-fragment DQ3_FSTRING_CHAR : ~[\\{}"]; //
+fragment SQ1__ISTRING_PART : SQ1_ISTRING_CHAR | ISTRING_ESCAPE_SEQ;
+fragment DQ1__ISTRING_PART : DQ1_ISTRING_CHAR | ISTRING_ESCAPE_SEQ;
+fragment SQ3__ISTRING_PART : ONE_OR_TWO_SQUOTE? (SQ3_ISTRING_CHAR | ISTRING_ESCAPE_SEQ);
+fragment DQ3__ISTRING_PART : ONE_OR_TWO_DQUOTE? (DQ3_ISTRING_CHAR | ISTRING_ESCAPE_SEQ);
-fragment TERMINATING_SQ3__FSTRING_MIDDLE : ONE_OR_TWO_SQUOTE '{' | ONE_OR_TWO_SQUOTE? TERMINATING_FSTRING_MIDDLE;
-fragment TERMINATING_DQ3__FSTRING_MIDDLE : ONE_OR_TWO_DQUOTE '{' | ONE_OR_TWO_DQUOTE? TERMINATING_FSTRING_MIDDLE;
-fragment TERMINATING_SQ3R_FSTRING_MIDDLE : ONE_OR_TWO_SQUOTE '{' | ONE_OR_TWO_SQUOTE? TERMINATING_FSTRING_MIDDLE_RAW;
-fragment TERMINATING_DQ3R_FSTRING_MIDDLE : ONE_OR_TWO_DQUOTE '{' | ONE_OR_TWO_DQUOTE? TERMINATING_FSTRING_MIDDLE_RAW;
+fragment SQ1R_ISTRING_PART : SQ1_ISTRING_CHAR | ISTRING_ESCAPE_SEQ_RAW;
+fragment DQ1R_ISTRING_PART : DQ1_ISTRING_CHAR | ISTRING_ESCAPE_SEQ_RAW;
+fragment SQ3R_ISTRING_PART : ONE_OR_TWO_SQUOTE? (SQ3_ISTRING_CHAR | ISTRING_ESCAPE_SEQ_RAW);
+fragment DQ3R_ISTRING_PART : ONE_OR_TWO_DQUOTE? (DQ3_ISTRING_CHAR | ISTRING_ESCAPE_SEQ_RAW);
-fragment TERMINATING_FSTRING_MIDDLE : '\\'? DOUBLE_BRACE | '\\{' | ESCAPE_SEQ_NAMED_CHAR;
-fragment TERMINATING_FSTRING_MIDDLE_RAW : '\\'? DOUBLE_BRACE | '\\{' ; // https://docs.python.org/3/faq/design.html#why-can-t-raw-strings-r-strings-end-with-a-backslash
-fragment FSTRING_ESCAPE_SEQ : ESCAPE_SEQ_NEWLINE | '\\' ~[{}N]; // f"\\}" causes a lexer error
-fragment FSTRING_ESCAPE_SEQ_RAW : ESCAPE_SEQ_NEWLINE | '\\' ~[{}]; // fr"\}" causes a lexer error
+
+fragment SQ1_ISTRING_CHAR : ~[\\{}'\r\n]; //
+fragment DQ1_ISTRING_CHAR : ~[\\{}"\r\n]; //
+fragment SQ3_ISTRING_CHAR : ~[\\{}']; //
+fragment DQ3_ISTRING_CHAR : ~[\\{}"]; //
+
+
+
+fragment TERMINATING_SQ3__ISTRING_MIDDLE : ONE_OR_TWO_SQUOTE_LBRACE | ONE_OR_TWO_SQUOTE? TERMINATING_ISTRING_MIDDLE;
+fragment TERMINATING_DQ3__ISTRING_MIDDLE : ONE_OR_TWO_DQUOTE_LBRACE | ONE_OR_TWO_DQUOTE? TERMINATING_ISTRING_MIDDLE;
+fragment TERMINATING_SQ3R_ISTRING_MIDDLE : ONE_OR_TWO_SQUOTE_LBRACE | ONE_OR_TWO_SQUOTE? TERMINATING_ISTRING_MIDDLE_RAW;
+fragment TERMINATING_DQ3R_ISTRING_MIDDLE : ONE_OR_TWO_DQUOTE_LBRACE | ONE_OR_TWO_DQUOTE? TERMINATING_ISTRING_MIDDLE_RAW;
+fragment ONE_OR_TWO_SQUOTE_LBRACE : ONE_OR_TWO_SQUOTE '{';
+fragment ONE_OR_TWO_DQUOTE_LBRACE : ONE_OR_TWO_DQUOTE '{';
+
+fragment TERMINATING_ISTRING_MIDDLE : TERMINATING_ISTRING_MIDDLE_RAW | ESCAPE_SEQ_NAMED_CHAR;
+fragment TERMINATING_ISTRING_MIDDLE_RAW : '\\'? DOUBLE_BRACE | '\\{' ; // https://docs.python.org/3/faq/design.html#why-can-t-raw-strings-r-strings-end-with-a-backslash
+
+fragment ISTRING_ESCAPE_SEQ : ESCAPE_SEQ_NEWLINE | '\\' ~[{}N]; // f"\\}" causes a lexer error
+fragment ISTRING_ESCAPE_SEQ_RAW : ESCAPE_SEQ_NEWLINE | '\\' ~[{}]; // fr"\}" causes a lexer error
fragment ONE_OR_TWO_SQUOTE : ['][']?;
fragment ONE_OR_TWO_DQUOTE : ["]["]?;
-fragment DOUBLE_BRACE : '{{' | '}}'; // will be replaced to single brace in PythonLexerBase class
-
-fragment ESCAPE_SEQ_NAMED_CHAR : '\\N{' .*? '}'; // an escape sequence for a character by a name from the Unicode database
-fragment ESCAPE_SEQ_NEWLINE : BACKSLASH_NEWLINE; // it is a kind of line continuation for string literals (backslash and newline will be ignored)
+fragment DOUBLE_BRACE : '{{' | '}}'; // PythonLexerBase replaces double brace with single brace
+fragment ESCAPE_SEQ_NAMED_CHAR : '\\N{' .*? '}'; // an escape sequence for a Unicode character specified by name
+fragment ESCAPE_SEQ_NEWLINE : BACKSLASH_NEWLINE; // this escape sequence acts as a line continuation in string literals
+ // the backslash and newline are ignored by the Python interpreter
fragment BACKSLASH_NEWLINE : '\\' NEWLINE;
-// https://docs.python.org/3.13/reference/lexical_analysis.html#integer-literals
+// https://docs.python.org/3.14/reference/lexical_analysis.html#integer-literals
fragment INTEGER : DEC_INTEGER | BIN_INTEGER | OCT_INTEGER | HEX_INTEGER;
fragment DEC_INTEGER : NON_ZERO_DIGIT ('_'? DIGIT)* | '0'+ ('_'? '0')*;
fragment BIN_INTEGER : '0' ('b' | 'B') ('_'? BIN_DIGIT)+;
@@ -406,7 +563,7 @@ fragment BIN_DIGIT : '0' | '1';
fragment OCT_DIGIT : [0-7];
fragment HEX_DIGIT : DIGIT | [a-f] | [A-F];
-// https://docs.python.org/3.13/reference/lexical_analysis.html#floating-point-literals
+// https://docs.python.org/3.14/reference/lexical_analysis.html#floating-point-literals
fragment FLOAT_NUMBER : POINT_FLOAT | EXPONENT_FLOAT;
fragment POINT_FLOAT : DIGIT_PART? FRACTION | DIGIT_PART '.';
fragment EXPONENT_FLOAT : (DIGIT_PART | POINT_FLOAT) EXPONENT;
@@ -414,11 +571,11 @@ fragment DIGIT_PART : DIGIT ('_'? DIGIT)*;
fragment FRACTION : '.' DIGIT_PART;
fragment EXPONENT : ('e' | 'E') ('+' | '-')? DIGIT_PART;
-// https://docs.python.org/3.13/reference/lexical_analysis.html#imaginary-literals
+// https://docs.python.org/3.14/reference/lexical_analysis.html#imaginary-literals
fragment IMAG_NUMBER : (FLOAT_NUMBER | DIGIT_PART) ('j' | 'J');
-// https://github.com/RobEin/ANTLR4-parser-for-Python-3.13/tree/main/valid_chars_in_py_identifiers
-fragment ID_CONTINUE
+// https://github.com/RobEin/ANTLR4-parser-for-Python-3.14/tree/main/utils/valid_chars_in_py_identifiers
+fragment ID_CONTINUE // for Python 3.14.2
: ID_START
| '\u{0030}' .. '\u{0039}'
| '\u{00B7}'
@@ -449,7 +606,7 @@ fragment ID_CONTINUE
| '\u{0825}' .. '\u{0827}'
| '\u{0829}' .. '\u{082D}'
| '\u{0859}' .. '\u{085B}'
- | '\u{0898}' .. '\u{089F}'
+ | '\u{0897}' .. '\u{089F}'
| '\u{08CA}' .. '\u{08E1}'
| '\u{08E3}' .. '\u{0903}'
| '\u{093A}' .. '\u{093C}'
@@ -666,8 +823,10 @@ fragment ID_CONTINUE
| '\u{10AE5}' .. '\u{10AE6}'
| '\u{10D24}' .. '\u{10D27}'
| '\u{10D30}' .. '\u{10D39}'
+ | '\u{10D40}' .. '\u{10D49}'
+ | '\u{10D69}' .. '\u{10D6D}'
| '\u{10EAB}' .. '\u{10EAC}'
- | '\u{10EFD}' .. '\u{10EFF}'
+ | '\u{10EFC}' .. '\u{10EFF}'
| '\u{10F46}' .. '\u{10F50}'
| '\u{10F82}' .. '\u{10F85}'
| '\u{11000}' .. '\u{11002}'
@@ -701,6 +860,13 @@ fragment ID_CONTINUE
| '\u{11362}' .. '\u{11363}'
| '\u{11366}' .. '\u{1136C}'
| '\u{11370}' .. '\u{11374}'
+ | '\u{113B8}' .. '\u{113C0}'
+ | '\u{113C2}'
+ | '\u{113C5}'
+ | '\u{113C7}' .. '\u{113CA}'
+ | '\u{113CC}' .. '\u{113D0}'
+ | '\u{113D2}'
+ | '\u{113E1}' .. '\u{113E2}'
| '\u{11435}' .. '\u{11446}'
| '\u{11450}' .. '\u{11459}'
| '\u{1145E}'
@@ -713,6 +879,7 @@ fragment ID_CONTINUE
| '\u{11650}' .. '\u{11659}'
| '\u{116AB}' .. '\u{116B7}'
| '\u{116C0}' .. '\u{116C9}'
+ | '\u{116D0}' .. '\u{116E3}'
| '\u{1171D}' .. '\u{1172B}'
| '\u{11730}' .. '\u{11739}'
| '\u{1182C}' .. '\u{1183A}'
@@ -732,6 +899,7 @@ fragment ID_CONTINUE
| '\u{11A47}'
| '\u{11A51}' .. '\u{11A5B}'
| '\u{11A8A}' .. '\u{11A99}'
+ | '\u{11BF0}' .. '\u{11BF9}'
| '\u{11C2F}' .. '\u{11C36}'
| '\u{11C38}' .. '\u{11C3F}'
| '\u{11C50}' .. '\u{11C59}'
@@ -752,20 +920,23 @@ fragment ID_CONTINUE
| '\u{11F03}'
| '\u{11F34}' .. '\u{11F3A}'
| '\u{11F3E}' .. '\u{11F42}'
- | '\u{11F50}' .. '\u{11F59}'
+ | '\u{11F50}' .. '\u{11F5A}'
| '\u{13440}'
| '\u{13447}' .. '\u{13455}'
+ | '\u{1611E}' .. '\u{16139}'
| '\u{16A60}' .. '\u{16A69}'
| '\u{16AC0}' .. '\u{16AC9}'
| '\u{16AF0}' .. '\u{16AF4}'
| '\u{16B30}' .. '\u{16B36}'
| '\u{16B50}' .. '\u{16B59}'
+ | '\u{16D70}' .. '\u{16D79}'
| '\u{16F4F}'
| '\u{16F51}' .. '\u{16F87}'
| '\u{16F8F}' .. '\u{16F92}'
| '\u{16FE4}'
| '\u{16FF0}' .. '\u{16FF1}'
| '\u{1BC9D}' .. '\u{1BC9E}'
+ | '\u{1CCF0}' .. '\u{1CCF9}'
| '\u{1CF00}' .. '\u{1CF2D}'
| '\u{1CF30}' .. '\u{1CF46}'
| '\u{1D165}' .. '\u{1D169}'
@@ -792,6 +963,8 @@ fragment ID_CONTINUE
| '\u{1E2AE}'
| '\u{1E2EC}' .. '\u{1E2F9}'
| '\u{1E4EC}' .. '\u{1E4F9}'
+ | '\u{1E5EE}' .. '\u{1E5EF}'
+ | '\u{1E5F1}' .. '\u{1E5FA}'
| '\u{1E8D0}' .. '\u{1E8D6}'
| '\u{1E944}' .. '\u{1E94A}'
| '\u{1E950}' .. '\u{1E959}'
@@ -799,7 +972,7 @@ fragment ID_CONTINUE
| '\u{E0100}' .. '\u{E01EF}'
;
-fragment ID_START
+fragment ID_START // for Python 3.14.2
: '\u{0041}' .. '\u{005A}'
| '\u{005F}'
| '\u{0061}' .. '\u{007A}'
@@ -1025,7 +1198,7 @@ fragment ID_START
| '\u{1C00}' .. '\u{1C23}'
| '\u{1C4D}' .. '\u{1C4F}'
| '\u{1C5A}' .. '\u{1C7D}'
- | '\u{1C80}' .. '\u{1C88}'
+ | '\u{1C80}' .. '\u{1C8A}'
| '\u{1C90}' .. '\u{1CBA}'
| '\u{1CBD}' .. '\u{1CBF}'
| '\u{1CE9}' .. '\u{1CEC}'
@@ -1108,10 +1281,10 @@ fragment ID_START
| '\u{A6A0}' .. '\u{A6EF}'
| '\u{A717}' .. '\u{A71F}'
| '\u{A722}' .. '\u{A788}'
- | '\u{A78B}' .. '\u{A7CA}'
+ | '\u{A78B}' .. '\u{A7CD}'
| '\u{A7D0}' .. '\u{A7D1}'
| '\u{A7D3}'
- | '\u{A7D5}' .. '\u{A7D9}'
+ | '\u{A7D5}' .. '\u{A7DC}'
| '\u{A7F2}' .. '\u{A801}'
| '\u{A803}' .. '\u{A805}'
| '\u{A807}' .. '\u{A80A}'
@@ -1216,6 +1389,7 @@ fragment ID_START
| '\u{105A3}' .. '\u{105B1}'
| '\u{105B3}' .. '\u{105B9}'
| '\u{105BB}' .. '\u{105BC}'
+ | '\u{105C0}' .. '\u{105F3}'
| '\u{10600}' .. '\u{10736}'
| '\u{10740}' .. '\u{10755}'
| '\u{10760}' .. '\u{10767}'
@@ -1252,8 +1426,11 @@ fragment ID_START
| '\u{10C80}' .. '\u{10CB2}'
| '\u{10CC0}' .. '\u{10CF2}'
| '\u{10D00}' .. '\u{10D23}'
+ | '\u{10D4A}' .. '\u{10D65}'
+ | '\u{10D6F}' .. '\u{10D85}'
| '\u{10E80}' .. '\u{10EA9}'
| '\u{10EB0}' .. '\u{10EB1}'
+ | '\u{10EC2}' .. '\u{10EC4}'
| '\u{10F00}' .. '\u{10F1C}'
| '\u{10F27}'
| '\u{10F30}' .. '\u{10F45}'
@@ -1292,6 +1469,13 @@ fragment ID_START
| '\u{1133D}'
| '\u{11350}'
| '\u{1135D}' .. '\u{11361}'
+ | '\u{11380}' .. '\u{11389}'
+ | '\u{1138B}'
+ | '\u{1138E}'
+ | '\u{11390}' .. '\u{113B5}'
+ | '\u{113B7}'
+ | '\u{113D1}'
+ | '\u{113D3}'
| '\u{11400}' .. '\u{11434}'
| '\u{11447}' .. '\u{1144A}'
| '\u{1145F}' .. '\u{11461}'
@@ -1326,6 +1510,7 @@ fragment ID_START
| '\u{11A5C}' .. '\u{11A89}'
| '\u{11A9D}'
| '\u{11AB0}' .. '\u{11AF8}'
+ | '\u{11BC0}' .. '\u{11BE0}'
| '\u{11C00}' .. '\u{11C08}'
| '\u{11C0A}' .. '\u{11C2E}'
| '\u{11C40}'
@@ -1349,7 +1534,9 @@ fragment ID_START
| '\u{12F90}' .. '\u{12FF0}'
| '\u{13000}' .. '\u{1342F}'
| '\u{13441}' .. '\u{13446}'
+ | '\u{13460}' .. '\u{143FA}'
| '\u{14400}' .. '\u{14646}'
+ | '\u{16100}' .. '\u{1611D}'
| '\u{16800}' .. '\u{16A38}'
| '\u{16A40}' .. '\u{16A5E}'
| '\u{16A70}' .. '\u{16ABE}'
@@ -1358,6 +1545,7 @@ fragment ID_START
| '\u{16B40}' .. '\u{16B43}'
| '\u{16B63}' .. '\u{16B77}'
| '\u{16B7D}' .. '\u{16B8F}'
+ | '\u{16D40}' .. '\u{16D6C}'
| '\u{16E40}' .. '\u{16E7F}'
| '\u{16F00}' .. '\u{16F4A}'
| '\u{16F50}'
@@ -1366,7 +1554,7 @@ fragment ID_START
| '\u{16FE3}'
| '\u{17000}' .. '\u{187F7}'
| '\u{18800}' .. '\u{18CD5}'
- | '\u{18D00}' .. '\u{18D08}'
+ | '\u{18CFF}' .. '\u{18D08}'
| '\u{1AFF0}' .. '\u{1AFF3}'
| '\u{1AFF5}' .. '\u{1AFFB}'
| '\u{1AFFD}' .. '\u{1AFFE}'
@@ -1419,6 +1607,8 @@ fragment ID_START
| '\u{1E290}' .. '\u{1E2AD}'
| '\u{1E2C0}' .. '\u{1E2EB}'
| '\u{1E4D0}' .. '\u{1E4EB}'
+ | '\u{1E5D0}' .. '\u{1E5ED}'
+ | '\u{1E5F0}'
| '\u{1E7E0}' .. '\u{1E7E6}'
| '\u{1E7E8}' .. '\u{1E7EB}'
| '\u{1E7ED}' .. '\u{1E7EE}'
@@ -1468,4 +1658,4 @@ fragment ID_START
| '\u{2F800}' .. '\u{2FA1D}'
| '\u{30000}' .. '\u{3134A}'
| '\u{31350}' .. '\u{323AF}'
- ;
+ ;
\ No newline at end of file
diff --git a/python/python3_13/PythonParser.g4 b/python/python3_14/PythonParser.g4
similarity index 93%
rename from python/python3_13/PythonParser.g4
rename to python/python3_14/PythonParser.g4
index 4a1059a638..a8b2f3e3a9 100644
--- a/python/python3_13/PythonParser.g4
+++ b/python/python3_14/PythonParser.g4
@@ -21,17 +21,13 @@ THE SOFTWARE.
*/
/*
- * Project : an ANTLR4 parser grammar by the official PEG grammar
- * https://github.com/RobEin/ANTLR4-parser-for-Python-3.13
+ * Project : an ANTLR4 parser grammar for the Python 3 programming language based on the official PEG grammar
+ * https://github.com/RobEin/ANTLR4-parser-for-Python-3.14
* Developed by : Robert Einhorn
*
*/
- /*
- * Contributors : [Willie Shen](https://github.com/Willie169)
- */
-
-// Python 3.13.2 https://docs.python.org/3.13/reference/grammar.html#full-grammar-specification
+// Python 3.14.2 https://docs.python.org/3.14/reference/grammar.html#full-grammar-specification
parser grammar PythonParser;
@@ -50,10 +46,15 @@ func_type: '(' type_expressions? ')' '->' expression NEWLINE* EOF;
statements: statement+;
-statement: compound_stmt | simple_stmts;
+statement
+ : compound_stmt
+ | simple_stmts;
+
+single_compound_stmt
+ : compound_stmt;
statement_newline
- : compound_stmt NEWLINE
+ : single_compound_stmt NEWLINE
| simple_stmts
| NEWLINE
| EOF;
@@ -71,12 +72,12 @@ simple_stmt
| return_stmt
| import_stmt
| raise_stmt
- | 'pass'
+ | pass_stmt
| del_stmt
| yield_stmt
| assert_stmt
- | 'break'
- | 'continue'
+ | break_stmt
+ | continue_stmt
| global_stmt
| nonlocal_stmt;
@@ -98,8 +99,8 @@ assignment
: name ':' expression ('=' annotated_rhs )?
| ('(' single_target ')'
| single_subscript_attribute_target) ':' expression ('=' annotated_rhs )?
- | (star_targets '=' )+ (yield_expr | star_expressions) TYPE_COMMENT?
- | single_target augassign (yield_expr | star_expressions);
+ | (star_targets '=' )+ annotated_rhs TYPE_COMMENT?
+ | single_target augassign annotated_rhs;
annotated_rhs: yield_expr | star_expressions;
@@ -125,6 +126,15 @@ raise_stmt
: 'raise' (expression ('from' expression )?)?
;
+pass_stmt
+ : 'pass';
+
+break_stmt
+ : 'break';
+
+continue_stmt
+ : 'continue';
+
global_stmt: 'global' name (',' name)*;
nonlocal_stmt: 'nonlocal' name (',' name)*;
@@ -156,10 +166,12 @@ import_from_as_names
: import_from_as_name (',' import_from_as_name)*;
import_from_as_name
: name ('as' name )?;
+
dotted_as_names
: dotted_as_name (',' dotted_as_name)*;
dotted_as_name
: dotted_name ('as' name )?;
+
dotted_name
: dotted_name '.' name
| name;
@@ -311,10 +323,14 @@ try_stmt
// ----------------
except_block
- : 'except' (expression ('as' name )?)? ':' block
+ : 'except' (expression ('as' name )? | expressions)? ':' block
;
+
+
except_star_block
- : 'except' '*' expression ('as' name )? ':' block;
+ : 'except' '*' (expression ('as' name )? | expressions) ':' block
+ ;
+
finally_block
: 'finally' ':' block;
@@ -472,7 +488,8 @@ type_alias
// Type parameter declaration
// --------------------------
-type_params: '[' type_param_seq ']';
+type_params
+ : '[' type_param_seq ']';
type_param_seq: type_param (',' type_param)* ','?;
@@ -481,8 +498,6 @@ type_param
| '*' name type_param_starred_default?
| '**' name type_param_default?
;
-
-
type_param_bound: ':' expression;
type_param_default: '=' expression;
type_param_starred_default: '=' star_expression;
@@ -719,8 +734,25 @@ fstring_format_spec
fstring
: FSTRING_START fstring_middle* FSTRING_END;
+
+
+tstring_format_spec
+ : TSTRING_MIDDLE
+ | tstring_replacement_field;
+tstring_full_format_spec
+ : ':' tstring_format_spec*;
+tstring_replacement_field
+ : LBRACE annotated_rhs '='? fstring_conversion? tstring_full_format_spec? RBRACE;
+tstring_middle
+ : tstring_replacement_field
+ | TSTRING_MIDDLE;
+tstring
+ : TSTRING_START tstring_middle* TSTRING_END;
+
string: STRING;
-strings: (fstring|string)+;
+strings
+ : (fstring|string)+
+ |tstring+;
list
: '[' star_named_expressions? ']';
@@ -875,7 +907,7 @@ func_type_comment
: NEWLINE TYPE_COMMENT // Must be followed by indented block
| TYPE_COMMENT;
-// *** related to soft keywords: https://docs.python.org/3.13/reference/lexical_analysis.html#soft-keywords
+// *** related to soft keywords: https://docs.python.org/3.14/reference/lexical_analysis.html#soft-keywords
name_except_underscore
: NAME // ***** The NAME token can be used only in this rule *****
| NAME_OR_TYPE
diff --git a/python/python3_14/README.md b/python/python3_14/README.md
new file mode 100644
index 0000000000..50cc7d1725
--- /dev/null
+++ b/python/python3_14/README.md
@@ -0,0 +1,21 @@
+# Python 3.14.2 parser
+
+### About files:
+- PythonParser.g4 is the ANTLR4 parser grammar that is based on the official [Python PEG grammar](https://docs.python.org/3.14/reference/grammar.html)
+
+- PythonLexerBase class
+ - handles the Python indentations
+ - creates encoding token
+ - tokenizes fstring literals
+ - and manages many other things
+
+- Example files from: [Python 3.14 Standard Lib](https://github.com/python/cpython/tree/3.14/Lib)
+
+### Recent changes:
+- parser grammar update for Python 3.14.2
+- tokenizing t-string literals
+- tokenizing the BOM Unicode character at the start of the file so that it is skipped in the token stream
+- moved encoding detection from PythonLexerBase to a separate component
+
+### Related link:
+[ANTLR4-parser-for-Python-3.14](https://github.com/RobEin/ANTLR4-parser-for-Python-3.14)
\ No newline at end of file
diff --git a/python/python3_14/TypeScript/PythonLexerBase.ts b/python/python3_14/TypeScript/PythonLexerBase.ts
new file mode 100644
index 0000000000..6b112b3409
--- /dev/null
+++ b/python/python3_14/TypeScript/PythonLexerBase.ts
@@ -0,0 +1,779 @@
+/*
+The MIT License (MIT)
+Copyright (c) 2021 Robert Einhorn
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+ */
+
+/*
+ *
+ * Project : A helper class for an ANTLR4 Python lexer grammar that assists in tokenizing indentation,
+ * interpolated strings, and encoding declaration.
+ *
+ * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com
+ *
+ */
+
+import { CharStream, CharStreams, CommonTokenStream, Token, CommonToken, Lexer } from "antlr4";
+import { TokenSource } from "antlr4/src/antlr4/TokenSource.js";
+import PythonLexer from "./PythonLexer.js";
+import PythonParser from "./PythonParser.js";
+import * as Collections from "typescript-collections";
+
+export default abstract class PythonLexerBase extends Lexer {
+ private static readonly LEXER_MODES_FOR_ISTRING_START: Map = new Map();
+ private static readonly INVALID_LENGTH: number = -1;
+ private static readonly ERR_TXT: string = " ERROR: ";
+ private static readonly TAB_LENGTH: number = 8;
+
+ private encodingName!: string;
+
+ // Indentation handling
+ private indentLengthStack!: Collections.Stack;
+ private pendingTokens!: Array;
+
+ private previousPendingTokenType!: number;
+ private lastPendingTokenTypeFromDefaultChannel!: number;
+
+ // Parenthesis / bracket / brace counts
+ private opened!: number;
+ private paren_or_bracket_openedStack!: Array;
+ private braceExpressionStack!: Array;
+ private prevBraceExpression!: string;
+
+ // Current interpolated STRING_MIDDLE token type (FSTRING_MIDDLE or TSTRING_MIDDLE)
+ private curISTRING_MIDDLEtokenType!: number;
+
+ // We reimplement mode/stack because not all runtimes expose _mode/_modeStack
+ private curLexerMode!: number;
+ private lexerModeStack!: Array;
+
+ // Indentation diagnostics
+ private wasSpaceIndentation!: boolean;
+ private wasTabIndentation!: boolean;
+ private wasIndentationMixedWithSpacesAndTabs!: boolean;
+
+ // Current / lookahead tokens
+ private curToken: Token | undefined;
+ private ffgToken: Token | undefined;
+
+ protected constructor(input: CharStream) {
+ super(input);
+ this.init();
+ }
+
+ public reset(): void {
+ this.init();
+ super.reset();
+ }
+
+ private init(): void {
+ this.encodingName = "";
+ this.indentLengthStack = new Collections.Stack();
+ this.pendingTokens = [];
+ this.previousPendingTokenType = 0;
+ this.lastPendingTokenTypeFromDefaultChannel = 0;
+ this.opened = 0;
+ this.paren_or_bracket_openedStack = [];
+ this.braceExpressionStack = [];
+ this.prevBraceExpression = "";
+ this.curISTRING_MIDDLEtokenType = 0;
+ this.curLexerMode = Lexer.DEFAULT_MODE;
+ this.lexerModeStack = [];
+ this.wasSpaceIndentation = false;
+ this.wasTabIndentation = false;
+ this.wasIndentationMixedWithSpacesAndTabs = false;
+ this.curToken = undefined;
+ this.ffgToken = undefined;
+ }
+
+ /**
+ * Sets the encoding name to emit an ENCODING token at the start of the token stream.
+ * Leave empty if not needed (e.g., when parsing from string).
+ *
+ * @param encodingName - The encoding name (e.g., "utf-8"), or empty string to disable ENCODING token.
+ */
+ public setEncodingName(encodingName: string): void {
+ this.encodingName = encodingName;
+ }
+
+ public nextToken(): Token { // Reading the input stream until EOF is reached
+ this.checkNextToken();
+ return this.pendingTokens.shift()!; /* .pollFirst() */; // Add the queued token to the token stream
+ }
+
+ private checkNextToken(): void {
+ if (this.previousPendingTokenType == PythonLexer.EOF)
+ return;
+
+ this.setCurrentAndFollowingTokens();
+ if (this.indentLengthStack.isEmpty()) { // We're at the first token
+ this.handleStartOfInput();
+ }
+
+ switch (this.curToken!.type) {
+ case PythonLexer.NEWLINE:
+ this.handleNEWLINEtoken();
+ break;
+ case PythonLexer.LPAR:
+ case PythonLexer.LSQB:
+ case PythonLexer.LBRACE:
+ this.opened++;
+ this.addPendingToken(this.curToken!);
+ break;
+ case PythonLexer.RPAR:
+ case PythonLexer.RSQB:
+ case PythonLexer.RBRACE:
+ this.opened--;
+ this.addPendingToken(this.curToken!);
+ break;
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE:
+ this.handleISTRING_MIDDLEtokenWithDoubleBrace(); // does not affect the opened field
+ this.addPendingToken(this.curToken!);
+ break;
+ case PythonLexer.COLONEQUAL:
+ this.handleCOLONEQUALtokenInIString();
+ break;
+ case PythonLexer.ERRORTOKEN:
+ this.reportLexerError(`token recognition error at: '${this.curToken!.text}'`);
+ this.addPendingToken(this.curToken!);
+ break;
+ case PythonLexer.EOF:
+ this.handleEOFtoken();
+ break;
+ default:
+ this.addPendingToken(this.curToken!);
+ }
+ this.handleFORMAT_SPECIFICATION_MODE();
+ }
+
+ private setCurrentAndFollowingTokens(): void {
+ this.curToken = this.ffgToken == undefined
+ ? super.nextToken()
+ : this.ffgToken;
+
+ this.checkCurToken(); // Do not use ffgToken in this method or any of its submethods — it hasn't been set yet!
+
+ this.ffgToken = this.curToken!.type === PythonLexer.EOF
+ ? this.curToken
+ : super.nextToken();
+ }
+
+ // - initialize indent stack
+ // - skip BOM token
+ // - insert ENCODING token (if any)
+ // - hide leading NEWLINE(s)
+ // - insert leading INDENT if first statement is indented
+ private handleStartOfInput(): void {
+ // initialize the stack with a default 0 indentation length
+ this.indentLengthStack.push(0); // this will never be popped off
+
+ if (this.curToken!.type === PythonLexer.BOM) {
+ this.setCurrentAndFollowingTokens();
+ }
+ this.insertENCODINGtoken();
+
+ while (this.curToken!.type !== PythonLexer.EOF) {
+ if (this.curToken!.channel === Token.DEFAULT_CHANNEL) {
+ if (this.curToken!.type === PythonLexer.NEWLINE) {
+ // all the NEWLINE tokens must be ignored before the first statement
+ this.hideAndAddPendingToken(this.curToken!);
+ } else { // We're at the first statement
+ this.insertLeadingIndentToken();
+ return; // continue the processing of the current token with checkNextToken()
+ }
+ } else {
+ this.addPendingToken(this.curToken!); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
+ }
+ this.setCurrentAndFollowingTokens();
+ } // continue the processing of the EOF token with checkNextToken()
+ }
+
+ private insertENCODINGtoken(): void { // https://peps.python.org/pep-0263/
+ if (this.encodingName === '') return;
+
+ const sourcePair = [this as unknown as TokenSource, this._input] as [TokenSource, CharStream];
+ const encodingToken: CommonToken = new CommonToken(sourcePair, PythonLexer.ENCODING, Token.HIDDEN_CHANNEL, /*start*/ 0, /*stop*/ 0);
+ encodingToken.text = this.encodingName;
+ encodingToken.line = 0;
+ encodingToken.column = -1;
+ this.addPendingToken(encodingToken);
+ }
+
+ private insertLeadingIndentToken(): void {
+ if (this.previousPendingTokenType === PythonLexer.WS) {
+ const prevToken: Token = this.pendingTokens.at(-1)!; /* stack peek */ // WS token
+ if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement
+ const errMsg: string = "first statement indented";
+ this.reportLexerError(errMsg);
+ // insert an INDENT token before the first statement to trigger an 'unexpected indent' error later in the parser
+ this.createAndAddPendingToken(PythonLexer.INDENT, PythonLexerBase.ERR_TXT + errMsg, this.curToken!);
+ }
+ }
+ }
+
+ private handleNEWLINEtoken(): void {
+ if (this.lexerModeStack.length > 0) { // for multi line f/t-string literals
+ this.addPendingToken(this.curToken!);
+ return;
+ }
+
+ if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
+ this.hideAndAddPendingToken(this.curToken!);
+ return;
+ }
+
+ const nlToken: Token = this.curToken!.clone(); // save the current NEWLINE token
+ const isLookingAhead: boolean = this.ffgToken!.type === PythonLexer.WS;
+ if (isLookingAhead) {
+ this.setCurrentAndFollowingTokens(); // set the next two tokens
+ }
+
+ switch (this.ffgToken!.type) {
+ case PythonLexer.NEWLINE: // We're before a blank line
+ case PythonLexer.COMMENT: // We're before a comment
+ this.hideAndAddPendingToken(nlToken);
+ if (isLookingAhead) {
+ this.addPendingToken(this.curToken!); // WS token
+ }
+ break;
+ default:
+ this.addPendingToken(nlToken);
+ if (isLookingAhead) { // We're on whitespace(s) followed by a statement
+ const indentationLength: number = this.ffgToken!.type === PythonLexer.EOF ?
+ 0 :
+ this.getIndentationLength(this.curToken!.text);
+
+ if (indentationLength !== PythonLexerBase.INVALID_LENGTH) {
+ this.addPendingToken(this.curToken!); // WS token
+ this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
+ } else {
+ this.reportError("inconsistent use of tabs and spaces in indentation");
+ }
+ } else { // We're at a newline followed by a statement (there is no whitespace before the statement)
+ this.insertIndentOrDedentToken(0); // may insert DEDENT token(s)
+ }
+ }
+ }
+
+ private insertIndentOrDedentToken(indentLength: number): void {
+ let prevIndentLength: number = this.indentLengthStack.peek()!;
+ if (indentLength > prevIndentLength) {
+ this.createAndAddPendingToken(PythonLexer.INDENT, null, this.ffgToken!);
+ this.indentLengthStack.push(indentLength);
+ return;
+ }
+
+ while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
+ this.indentLengthStack.pop();
+ prevIndentLength = this.indentLengthStack.peek()!;
+ if (indentLength <= prevIndentLength) {
+ this.createAndAddPendingToken(PythonLexer.DEDENT, null, this.ffgToken!);
+ } else {
+ this.reportError("inconsistent dedent");
+ }
+ }
+ }
+
+ private checkCurToken(): void {
+ switch (this.curToken!.type) {
+ case PythonLexer.FSTRING_START:
+ this.curISTRING_MIDDLEtokenType = PythonLexer.FSTRING_MIDDLE;
+ this.setLexerModeByISTRING_STARTtoken();
+ return;
+ case PythonLexer.TSTRING_START:
+ this.curISTRING_MIDDLEtokenType = PythonLexer.TSTRING_MIDDLE;
+ this.setLexerModeByISTRING_STARTtoken();
+ return;
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE:
+ this.handleISTRING_MIDDLEtokenWithQuoteAndLBrace(); // affects the opened field
+ switch (this.curToken!.type) {
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE:
+ return; // No curToken exchange happened
+ }
+ break;
+ case PythonLexer.FSTRING_END:
+ case PythonLexer.TSTRING_END:
+ this.popLexerMode();
+ return;
+ default:
+ if (this.lexerModeStack.length === 0) {
+ return; // Not in f/t-string mode
+ }
+ }
+ this.processBraceExpression();
+ }
+
+ private processBraceExpression(): void {
+ switch (this.curToken!.type) { // the following tokens can only come from default mode (after an LBRACE in f/t-string)
+ case PythonLexer.NEWLINE:
+ // append the current brace expression with the current newline
+ this.appendToBraceExpression(this.curToken!.text);
+ this.curToken!.channel = Token.HIDDEN_CHANNEL;
+ break;
+ case PythonLexer.LBRACE:
+ // the outermost brace expression cannot be a dictionary comprehension or a set comprehension
+ this.braceExpressionStack.push("{");
+ this.paren_or_bracket_openedStack.push(0);
+ this.pushLexerMode(Lexer.DEFAULT_MODE);
+ break;
+ case PythonLexer.LPAR:
+ case PythonLexer.LSQB:
+ // append the current brace expression with a "(" or a "["
+ this.appendToBraceExpression(this.curToken!.text);
+ // https://peps.python.org/pep-0498/#lambdas-inside-expressions
+ this.incrementBraceStack();
+ break;
+ case PythonLexer.RPAR:
+ case PythonLexer.RSQB:
+ // append the current brace expression with a ")" or a "]"
+ this.appendToBraceExpression(this.curToken!.text);
+ this.decrementBraceStack();
+ break;
+ case PythonLexer.COLON:
+ case PythonLexer.COLONEQUAL:
+ // append the current brace expression with a ":" or a ":="
+ this.appendToBraceExpression(this.curToken!.text);
+ this.setLexerModeByCOLONorCOLONEQUALtoken();
+ break;
+ case PythonLexer.RBRACE:
+ this.setLexerModeAfterRBRACEtoken();
+ break;
+ default:
+ // append the current brace expression with the current token text
+ this.appendToBraceExpression(this.curToken!.text);
+ }
+ }
+
+ private appendToBraceExpression(text: string): void {
+ const lastIndex: number = this.braceExpressionStack.length - 1;
+ this.braceExpressionStack[lastIndex] += text;
+ }
+
+ private incrementBraceStack(): void { // increment the last element (stack peek + 1)
+ const lastIndex: number = this.paren_or_bracket_openedStack.length - 1;
+ this.paren_or_bracket_openedStack[lastIndex]!++;
+ }
+
+ private decrementBraceStack(): void { // decrement the last element (stack peek - 1)
+ const lastIndex: number = this.paren_or_bracket_openedStack.length - 1;
+ this.paren_or_bracket_openedStack[lastIndex]!--;
+ }
+
+ private setLexerModeAfterRBRACEtoken(): void {
+ switch (this.curLexerMode) {
+ case Lexer.DEFAULT_MODE:
+ this.popLexerMode();
+ this.popByBRACE();
+ break;
+ case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.popLexerMode();
+ this.popLexerMode();
+ this.popByBRACE();
+ break;
+ default:
+ this.reportLexerError("f-string: single '}' is not allowed");
+ }
+ }
+
+ private setLexerModeByISTRING_STARTtoken(): void { // ISTRING = interpolated string (FSTRING or TSTRING)
+ if (PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.size === 0) {
+ PythonLexerBase.initLexerModesForIStringStart();
+ }
+
+ const interpolatedStringPrefix: string = this.curToken!.text.toLowerCase();
+ if (PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.has(interpolatedStringPrefix)) {
+ const newLexerMode: number = PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.get(interpolatedStringPrefix)!;
+ this.pushLexerMode(newLexerMode);
+ } else {
+ this.reportLexerError(
+ "internal error: unknown interpolated string literal prefix: " + this.curToken!.text
+ );
+ }
+ }
+
+ private static initLexerModesForIStringStart(): void {
+ // f-strings
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("f'", PythonLexer.SQ1__FSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rf'", PythonLexer.SQ1R_FSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("fr'", PythonLexer.SQ1R_FSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("f\"", PythonLexer.DQ1__FSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rf\"", PythonLexer.DQ1R_FSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("fr\"", PythonLexer.DQ1R_FSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("f'''", PythonLexer.SQ3__FSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rf'''", PythonLexer.SQ3R_FSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("fr'''", PythonLexer.SQ3R_FSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("f\"\"\"", PythonLexer.DQ3__FSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rf\"\"\"", PythonLexer.DQ3R_FSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("fr\"\"\"", PythonLexer.DQ3R_FSTRING_MODE);
+
+ // t-strings
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("t'", PythonLexer.SQ1__TSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rt'", PythonLexer.SQ1R_TSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("tr'", PythonLexer.SQ1R_TSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("t\"", PythonLexer.DQ1__TSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rt\"", PythonLexer.DQ1R_TSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("tr\"", PythonLexer.DQ1R_TSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("t'''", PythonLexer.SQ3__TSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rt'''", PythonLexer.SQ3R_TSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("tr'''", PythonLexer.SQ3R_TSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("t\"\"\"", PythonLexer.DQ3__TSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("rt\"\"\"", PythonLexer.DQ3R_TSTRING_MODE);
+ PythonLexerBase.LEXER_MODES_FOR_ISTRING_START.set("tr\"\"\"", PythonLexer.DQ3R_TSTRING_MODE);
+ }
+
+ private setLexerModeByCOLONorCOLONEQUALtoken(): void {
+ // Exit early when the current lexer mode indicates an open parenthesis/bracket
+ const opened: boolean = this.paren_or_bracket_openedStack.at(-1)! > 0; /* stack peek */
+ if (opened) {
+ return;
+ }
+
+ // COLONEQUAL token will be replaced with a COLON token in CheckNextToken()
+ const prevLexerMode: number = this.lexerModeStack.at(-1)!; /* stack peek */
+ switch (prevLexerMode) {
+ case PythonLexer.SQ1__FSTRING_MODE:
+ case PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.SQ1__FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ1__TSTRING_MODE:
+ case PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.SQ1__TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ1R_FSTRING_MODE:
+ case PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.SQ1R_FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ1R_TSTRING_MODE:
+ case PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.SQ1R_TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ1__FSTRING_MODE:
+ case PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.DQ1__FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ1__TSTRING_MODE:
+ case PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.DQ1__TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ1R_FSTRING_MODE:
+ case PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.DQ1R_FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ1R_TSTRING_MODE:
+ case PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.DQ1R_TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ3__FSTRING_MODE:
+ case PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.SQ3__FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ3__TSTRING_MODE:
+ case PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.SQ3__TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ3R_FSTRING_MODE:
+ case PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.SQ3R_FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.SQ3R_TSTRING_MODE:
+ case PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.SQ3R_TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ3__FSTRING_MODE:
+ case PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.DQ3__FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ3__TSTRING_MODE:
+ case PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.DQ3__TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ3R_FSTRING_MODE:
+ case PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.DQ3R_FSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+
+ case PythonLexer.DQ3R_TSTRING_MODE:
+ case PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE:
+ this.pushLexerMode(PythonLexer.DQ3R_TSTRING_FORMAT_SPECIFICATION_MODE);
+ break;
+ }
+ }
+
+ private popByBRACE(): void {
+ this.paren_or_bracket_openedStack.pop();
+ const curBraceExpression: string = this.braceExpressionStack.pop()!;
+ this.prevBraceExpression = curBraceExpression + "}";
+ if (this.braceExpressionStack.length > 0) {
+ // Extend the current brace expression by adding the previous expression
+ const lastIndex: number = this.braceExpressionStack.length - 1;
+ this.braceExpressionStack[lastIndex] += this.prevBraceExpression;
+ }
+ }
+
+ private handleISTRING_MIDDLEtokenWithDoubleBrace(): void { // ISTRING = interpolated string (FSTRING or TSTRING)
+ // Replace the trailing double brace with a single brace and insert a hidden brace token
+ const lastTwoChars: string = this.getLastTwoCharsOfTheCurTokenText();
+ switch (lastTwoChars) {
+ case "{{":
+ this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.HIDDEN_CHANNEL);
+ break;
+ case "}}":
+ this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.RBRACE, "}", Token.HIDDEN_CHANNEL);
+ break;
+ }
+ }
+
+ private handleISTRING_MIDDLEtokenWithQuoteAndLBrace(): void { // ISTRING = interpolated string (FSTRING or TSTRING)
+ // Replace the trailing quote + left_brace with a quote and insert an LBRACE token
+ // Replace the trailing backslash + left_brace with a backslash and insert an LBRACE token
+ const lastTwoChars: string = this.getLastTwoCharsOfTheCurTokenText();
+ switch (lastTwoChars) {
+ case "\"{":
+ case "'{":
+ case "\\{":
+ this.trimLastCharAddPendingTokenSetCurToken(PythonLexer.LBRACE, "{", Token.DEFAULT_CHANNEL);
+ break;
+ }
+ }
+
+ private getLastTwoCharsOfTheCurTokenText(): string {
+ const text = this.curToken!.text;
+ return text.length <= 2 ? text : text.slice(-2);
+ }
+
+ private trimLastCharAddPendingTokenSetCurToken(type: number, text: string, channel: number): void {
+ // Trim the last char and add the modified curToken to the pendingTokens stack
+ const tokenTextWithoutLastChar: string = this.curToken!.text.slice(0, -1);
+ this.curToken!.text = tokenTextWithoutLastChar;
+ this.curToken!.stop -= 1;
+ this.addPendingToken(this.curToken!);
+
+ this.createNewCurToken(type, text, channel); // Set curToken
+ }
+
+ private handleCOLONEQUALtokenInIString(): void { // ISTRING = interpolated string (FSTRING or TSTRING)
+ if (this.lexerModeStack.length > 0 &&
+ this.paren_or_bracket_openedStack.at(-1) === 0) { // stack peek === 0
+
+ // In an f-string, the walrus operator (:=) is only allowed inside parentheses.
+ // If used outside, split the COLONEQUAL token into a COLON
+ // (used as a format specifier instead of a walrus operator),
+ // and move the equal sign to the beginning of the next token (FSTRING_MIDDLE or TSTRING_MIDDLE).
+ this.curToken!.type = PythonLexer.COLON;
+ this.curToken!.text = ":";
+ this.curToken!.stop = this.curToken!.start;
+
+ switch (this.ffgToken!.type) {
+ case PythonLexer.FSTRING_MIDDLE:
+ case PythonLexer.TSTRING_MIDDLE: {
+ const token: Token = this.ffgToken!.clone();
+ token.text = "=" + token.text;
+ token.start -= 1;
+ token.column -= 1;
+ this.ffgToken = token;
+ break;
+ }
+ default: {
+ this.addPendingToken(this.curToken!);
+ this.createNewCurToken(this.curISTRING_MIDDLEtokenType, "=", Token.DEFAULT_CHANNEL);
+ }
+ }
+
+ }
+ this.addPendingToken(this.curToken!);
+ }
+
+ private createNewCurToken(type: number, text: string, channel: number): void {
+ const token: CommonToken = this.curToken!.clone();
+ token.type = type;
+ token.text = text;
+ token.channel = channel;
+ token.column += 1;
+ token.start += 1;
+ token.stop = token.start;
+ this.curToken = token;
+ }
+
+ private pushLexerMode(mode: number): void {
+ this.pushMode(mode);
+ this.lexerModeStack.push(this.curLexerMode);
+ this.curLexerMode = mode;
+ }
+
+ private popLexerMode(): void {
+ this.popMode();
+ this.curLexerMode = this.lexerModeStack.pop()!;
+ }
+
+ private handleFORMAT_SPECIFICATION_MODE(): void {
+ if (this.lexerModeStack.length == 0 || this.ffgToken!.type !== PythonLexer.RBRACE) {
+ return;
+ }
+
+ // insert an empty FSTRING_MIDDLE or TSTRING_MIDDLE token instead of the missing format specification
+ switch (this.curToken!.type) {
+ case PythonLexer.COLON:
+ this.createAndAddPendingToken(this.curISTRING_MIDDLEtokenType, "", this.ffgToken!);
+ break;
+ case PythonLexer.RBRACE:
+ // only when the previous brace expression is not a dictionary comprehension or set comprehension
+ if (!this.isValid_DictionaryOrSet_ComprehensionExpression(this.prevBraceExpression)) {
+ this.createAndAddPendingToken(this.curISTRING_MIDDLEtokenType, "", this.ffgToken!);
+ }
+ break;
+ }
+ }
+
+ private isValid_DictionaryOrSet_ComprehensionExpression(code: string): boolean {
+ const inputStream: CharStream = CharStreams.fromString(code);
+ const lexer: PythonLexer = new PythonLexer(inputStream);
+ const tokenStream: CommonTokenStream = new CommonTokenStream(lexer);
+ let parser = new PythonParser(tokenStream);
+
+ // Disable error listeners to suppress console output
+ lexer.removeErrorListeners();
+ parser.removeErrorListeners();
+
+ parser.dictcomp(); // Try parsing as dictionary comprehension
+ if (parser.syntaxErrorsCount === 0) {
+ return true;
+ }
+
+ parser = new PythonParser(tokenStream);
+ (tokenStream as any).seek(0); // seek method is not declared in CommonTokenStream.d.ts
+ parser.removeErrorListeners();
+ parser.setcomp(); // Try parsing as set comprehension
+ return parser.syntaxErrorsCount === 0;
+ }
+
+ private insertTrailingTokens(): void {
+ switch (this.lastPendingTokenTypeFromDefaultChannel) {
+ case PythonLexer.NEWLINE:
+ case PythonLexer.DEDENT:
+ break; // no trailing NEWLINE token is needed
+ default:
+ // insert an extra trailing NEWLINE token that serves as the end of the last statement
+ this.createAndAddPendingToken(PythonLexer.NEWLINE, null, this.ffgToken!); // ffgToken is EOF
+ }
+ this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
+ }
+
+ private handleEOFtoken(): void {
+ if (this.lastPendingTokenTypeFromDefaultChannel > 0) {
+ // there was a statement in the input (leading NEWLINE tokens are hidden)
+ this.insertTrailingTokens();
+ }
+ this.addPendingToken(this.curToken!);
+ }
+
+ private hideAndAddPendingToken(originalToken: Token): void {
+ originalToken.channel = Token.HIDDEN_CHANNEL;
+ this.addPendingToken(originalToken);
+ }
+
+ private createAndAddPendingToken(type: number, text: string | null, originalToken: Token): void {
+ const token: Token = originalToken.clone();
+ token.type = type;
+ token.channel = Token.DEFAULT_CHANNEL;
+ token.stop = originalToken.start - 1;
+ token.text = text == null ?
+ `<${PythonLexer.symbolicNames[type] ?? ""}>` :
+ text;
+
+ this.addPendingToken(token);
+ }
+
+ private addPendingToken(token: Token): void {
+ // save the last pending token type because the pendingTokens list can be empty by the nextToken()
+ this.previousPendingTokenType = token.type;
+ if (token.channel === Token.DEFAULT_CHANNEL) {
+ this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
+ }
+ this.pendingTokens.push(token) /* .addLast(token) */;
+ }
+
+ private getIndentationLength(indentText: string): number { // the indentText may contain spaces, tabs or form feeds
+ let length: number = 0;
+ for (let ch of indentText) {
+ switch (ch) {
+ case " ":
+ this.wasSpaceIndentation = true;
+ length += 1;
+ break;
+ case "\t":
+ this.wasTabIndentation = true;
+ length += PythonLexerBase.TAB_LENGTH - (length % PythonLexerBase.TAB_LENGTH);
+ break;
+ case "\f": // form feed
+ length = 0;
+ break;
+ }
+ }
+
+ if (this.wasTabIndentation && this.wasSpaceIndentation) {
+ if (!this.wasIndentationMixedWithSpacesAndTabs) {
+ this.wasIndentationMixedWithSpacesAndTabs = true;
+ length = PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent
+ }
+ }
+ return length;
+ }
+
+ private reportLexerError(errMsg: string): void {
+ this.getErrorListener().syntaxError(this, this.curToken!.type, this.curToken!.line, this.curToken!.column, " LEXER" + PythonLexerBase.ERR_TXT + errMsg, undefined);
+ }
+
+ private reportError(errMsg: string): void {
+ this.reportLexerError(errMsg);
+
+ this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken!);
+ // the ERRORTOKEN also triggers a parser error
+ }
+}
\ No newline at end of file
diff --git a/python/python3_13/changes.md b/python/python3_14/changes.md
similarity index 82%
rename from python/python3_13/changes.md
rename to python/python3_14/changes.md
index 7934d3757b..b9ceeee61c 100644
--- a/python/python3_13/changes.md
+++ b/python/python3_14/changes.md
@@ -1,3 +1,9 @@
+# Dec. 24, 2025
+- parser grammar update for Python 3.14.2
+- tokenizing t-string literals
+- tokenizing BOM Unicode character at the start of the file so it is skipped in the token stream
+- moved encoding detection from PythonLexerBase to a separate component
+
# Jan. 07, 2025
- parser grammar update for Python 3.13.1
- added ENCODING token
diff --git a/python/python3_13/desc.xml b/python/python3_14/desc.xml
similarity index 100%
rename from python/python3_13/desc.xml
rename to python/python3_14/desc.xml
diff --git a/python/python3_13/examples/__future__.py b/python/python3_14/examples/__future__.py
similarity index 100%
rename from python/python3_13/examples/__future__.py
rename to python/python3_14/examples/__future__.py
diff --git a/python/python3_13/examples/__hello__.py b/python/python3_14/examples/__hello__.py
similarity index 100%
rename from python/python3_13/examples/__hello__.py
rename to python/python3_14/examples/__hello__.py
diff --git a/python/python3_13/examples/_aix_support.py b/python/python3_14/examples/_aix_support.py
similarity index 100%
rename from python/python3_13/examples/_aix_support.py
rename to python/python3_14/examples/_aix_support.py
diff --git a/python/python3_13/examples/_android_support.py b/python/python3_14/examples/_android_support.py
similarity index 93%
rename from python/python3_13/examples/_android_support.py
rename to python/python3_14/examples/_android_support.py
index 7572745c85..a439d03a14 100644
--- a/python/python3_13/examples/_android_support.py
+++ b/python/python3_14/examples/_android_support.py
@@ -6,7 +6,7 @@
# The maximum length of a log message in bytes, including the level marker and
# tag, is defined as LOGGER_ENTRY_MAX_PAYLOAD at
# https://cs.android.com/android/platform/superproject/+/android-14.0.0_r1:system/logging/liblog/include/log/log.h;l=71.
-# Messages longer than this will be be truncated by logcat. This limit has already
+# Messages longer than this will be truncated by logcat. This limit has already
# been reduced at least once in the history of Android (from 4076 to 4068 between
# API level 23 and 26), so leave some headroom.
MAX_BYTES_PER_WRITE = 4000
@@ -29,15 +29,19 @@ def init_streams(android_log_write, stdout_prio, stderr_prio):
global logcat
logcat = Logcat(android_log_write)
-
- sys.stdout = TextLogStream(
- stdout_prio, "python.stdout", sys.stdout.fileno())
- sys.stderr = TextLogStream(
- stderr_prio, "python.stderr", sys.stderr.fileno())
+ sys.stdout = TextLogStream(stdout_prio, "python.stdout", sys.stdout)
+ sys.stderr = TextLogStream(stderr_prio, "python.stderr", sys.stderr)
class TextLogStream(io.TextIOWrapper):
- def __init__(self, prio, tag, fileno=None, **kwargs):
+ def __init__(self, prio, tag, original=None, **kwargs):
+ # Respect the -u option.
+ if original:
+ kwargs.setdefault("write_through", original.write_through)
+ fileno = original.fileno()
+ else:
+ fileno = None
+
# The default is surrogateescape for stdout and backslashreplace for
# stderr, but in the context of an Android log, readability is more
# important than reversibility.
diff --git a/python/python3_14/examples/_apple_support.py b/python/python3_14/examples/_apple_support.py
new file mode 100644
index 0000000000..92febdcf58
--- /dev/null
+++ b/python/python3_14/examples/_apple_support.py
@@ -0,0 +1,66 @@
+import io
+import sys
+
+
+def init_streams(log_write, stdout_level, stderr_level):
+ # Redirect stdout and stderr to the Apple system log. This method is
+ # invoked by init_apple_streams() (initconfig.c) if config->use_system_logger
+ # is enabled.
+ sys.stdout = SystemLog(log_write, stdout_level, errors=sys.stderr.errors)
+ sys.stderr = SystemLog(log_write, stderr_level, errors=sys.stderr.errors)
+
+
+class SystemLog(io.TextIOWrapper):
+ def __init__(self, log_write, level, **kwargs):
+ kwargs.setdefault("encoding", "UTF-8")
+ kwargs.setdefault("line_buffering", True)
+ super().__init__(LogStream(log_write, level), **kwargs)
+
+ def __repr__(self):
+        return f"<SystemLog (level {self.buffer.level})>"
+
+ def write(self, s):
+ if not isinstance(s, str):
+ raise TypeError(
+ f"write() argument must be str, not {type(s).__name__}")
+
+ # In case `s` is a str subclass that writes itself to stdout or stderr
+ # when we call its methods, convert it to an actual str.
+ s = str.__str__(s)
+
+ # We want to emit one log message per line, so split
+ # the string before sending it to the superclass.
+ for line in s.splitlines(keepends=True):
+ super().write(line)
+
+ return len(s)
+
+
+class LogStream(io.RawIOBase):
+ def __init__(self, log_write, level):
+ self.log_write = log_write
+ self.level = level
+
+ def __repr__(self):
+        return f"<LogStream (level {self.level})>"
+
+ def writable(self):
+ return True
+
+ def write(self, b):
+ if type(b) is not bytes:
+ try:
+ b = bytes(memoryview(b))
+ except TypeError:
+ raise TypeError(
+ f"write() argument must be bytes-like, not {type(b).__name__}"
+ ) from None
+
+ # Writing an empty string to the stream should have no effect.
+ if b:
+ # Encode null bytes using "modified UTF-8" to avoid truncating the
+ # message. This should not affect the return value, as the caller
+ # may be expecting it to match the length of the input.
+ self.log_write(self.level, b.replace(b"\x00", b"\xc0\x80"))
+
+ return len(b)
diff --git a/python/python3_14/examples/_ast_unparse.py b/python/python3_14/examples/_ast_unparse.py
new file mode 100644
index 0000000000..1c8741b5a5
--- /dev/null
+++ b/python/python3_14/examples/_ast_unparse.py
@@ -0,0 +1,1161 @@
+# This module contains ``ast.unparse()``, defined here
+# to improve the import time for the ``ast`` module.
+import sys
+from _ast import *
+from ast import NodeVisitor
+from contextlib import contextmanager, nullcontext
+from enum import IntEnum, auto, _simple_enum
+
+# Large float and imaginary literals get turned into infinities in the AST.
+# We unparse those infinities to INFSTR.
+_INFSTR = "1e" + repr(sys.float_info.max_10_exp + 1)
+
+@_simple_enum(IntEnum)
+class _Precedence:
+ """Precedence table that originated from python grammar."""
+
+ NAMED_EXPR = auto() # :=
+ TUPLE = auto() # ,
+ YIELD = auto() # 'yield', 'yield from'
+ TEST = auto() # 'if'-'else', 'lambda'
+ OR = auto() # 'or'
+ AND = auto() # 'and'
+ NOT = auto() # 'not'
+ CMP = auto() # '<', '>', '==', '>=', '<=', '!=',
+ # 'in', 'not in', 'is', 'is not'
+ EXPR = auto()
+ BOR = EXPR # '|'
+ BXOR = auto() # '^'
+ BAND = auto() # '&'
+ SHIFT = auto() # '<<', '>>'
+ ARITH = auto() # '+', '-'
+ TERM = auto() # '*', '@', '/', '%', '//'
+ FACTOR = auto() # unary '+', '-', '~'
+ POWER = auto() # '**'
+ AWAIT = auto() # 'await'
+ ATOM = auto()
+
+ def next(self):
+ try:
+ return self.__class__(self + 1)
+ except ValueError:
+ return self
+
+
+_SINGLE_QUOTES = ("'", '"')
+_MULTI_QUOTES = ('"""', "'''")
+_ALL_QUOTES = (*_SINGLE_QUOTES, *_MULTI_QUOTES)
+
+class Unparser(NodeVisitor):
+ """Methods in this class recursively traverse an AST and
+ output source code for the abstract syntax; original formatting
+ is disregarded."""
+
+ def __init__(self):
+ self._source = []
+ self._precedences = {}
+ self._type_ignores = {}
+ self._indent = 0
+ self._in_try_star = False
+ self._in_interactive = False
+
+ def interleave(self, inter, f, seq):
+ """Call f on each item in seq, calling inter() in between."""
+ seq = iter(seq)
+ try:
+ f(next(seq))
+ except StopIteration:
+ pass
+ else:
+ for x in seq:
+ inter()
+ f(x)
+
+ def items_view(self, traverser, items):
+ """Traverse and separate the given *items* with a comma and append it to
+ the buffer. If *items* is a single item sequence, a trailing comma
+ will be added."""
+ if len(items) == 1:
+ traverser(items[0])
+ self.write(",")
+ else:
+ self.interleave(lambda: self.write(", "), traverser, items)
+
+ def maybe_newline(self):
+ """Adds a newline if it isn't the start of generated source"""
+ if self._source:
+ self.write("\n")
+
+ def maybe_semicolon(self):
+ """Adds a "; " delimiter if it isn't the start of generated source"""
+ if self._source:
+ self.write("; ")
+
+ def fill(self, text="", *, allow_semicolon=True):
+ """Indent a piece of text and append it, according to the current
+ indentation level, or only delineate with semicolon if applicable"""
+ if self._in_interactive and not self._indent and allow_semicolon:
+ self.maybe_semicolon()
+ self.write(text)
+ else:
+ self.maybe_newline()
+ self.write(" " * self._indent + text)
+
+ def write(self, *text):
+ """Add new source parts"""
+ self._source.extend(text)
+
+ @contextmanager
+ def buffered(self, buffer = None):
+ if buffer is None:
+ buffer = []
+
+ original_source = self._source
+ self._source = buffer
+ yield buffer
+ self._source = original_source
+
+ @contextmanager
+ def block(self, *, extra = None):
+ """A context manager for preparing the source for blocks. It adds
+ the character':', increases the indentation on enter and decreases
+ the indentation on exit. If *extra* is given, it will be directly
+ appended after the colon character.
+ """
+ self.write(":")
+ if extra:
+ self.write(extra)
+ self._indent += 1
+ yield
+ self._indent -= 1
+
+ @contextmanager
+ def delimit(self, start, end):
+ """A context manager for preparing the source for expressions. It adds
+ *start* to the buffer and enters, after exit it adds *end*."""
+
+ self.write(start)
+ yield
+ self.write(end)
+
+ def delimit_if(self, start, end, condition):
+ if condition:
+ return self.delimit(start, end)
+ else:
+ return nullcontext()
+
+ def require_parens(self, precedence, node):
+ """Shortcut to adding precedence related parens"""
+ return self.delimit_if("(", ")", self.get_precedence(node) > precedence)
+
+ def get_precedence(self, node):
+ return self._precedences.get(node, _Precedence.TEST)
+
+ def set_precedence(self, precedence, *nodes):
+ for node in nodes:
+ self._precedences[node] = precedence
+
+ def get_raw_docstring(self, node):
+ """If a docstring node is found in the body of the *node* parameter,
+ return that docstring node, None otherwise.
+
+ Logic mirrored from ``_PyAST_GetDocString``."""
+ if not isinstance(
+ node, (AsyncFunctionDef, FunctionDef, ClassDef, Module)
+ ) or len(node.body) < 1:
+ return None
+ node = node.body[0]
+ if not isinstance(node, Expr):
+ return None
+ node = node.value
+ if isinstance(node, Constant) and isinstance(node.value, str):
+ return node
+
+ def get_type_comment(self, node):
+ comment = self._type_ignores.get(node.lineno) or node.type_comment
+ if comment is not None:
+ return f" # type: {comment}"
+
+ def traverse(self, node):
+ if isinstance(node, list):
+ for item in node:
+ self.traverse(item)
+ else:
+ super().visit(node)
+
+ # Note: as visit() resets the output text, do NOT rely on
+ # NodeVisitor.generic_visit to handle any nodes (as it calls back in to
+ # the subclass visit() method, which resets self._source to an empty list)
+ def visit(self, node):
+ """Outputs a source code string that, if converted back to an ast
+ (using ast.parse) will generate an AST equivalent to *node*"""
+ self._source = []
+ self.traverse(node)
+ return "".join(self._source)
+
+ def _write_docstring_and_traverse_body(self, node):
+ if (docstring := self.get_raw_docstring(node)):
+ self._write_docstring(docstring)
+ self.traverse(node.body[1:])
+ else:
+ self.traverse(node.body)
+
+ def visit_Module(self, node):
+ self._type_ignores = {
+ ignore.lineno: f"ignore{ignore.tag}"
+ for ignore in node.type_ignores
+ }
+ try:
+ self._write_docstring_and_traverse_body(node)
+ finally:
+ self._type_ignores.clear()
+
+ def visit_Interactive(self, node):
+ self._in_interactive = True
+ try:
+ self._write_docstring_and_traverse_body(node)
+ finally:
+ self._in_interactive = False
+
+ def visit_FunctionType(self, node):
+ with self.delimit("(", ")"):
+ self.interleave(
+ lambda: self.write(", "), self.traverse, node.argtypes
+ )
+
+ self.write(" -> ")
+ self.traverse(node.returns)
+
+ def visit_Expr(self, node):
+ self.fill()
+ self.set_precedence(_Precedence.YIELD, node.value)
+ self.traverse(node.value)
+
+ def visit_NamedExpr(self, node):
+ with self.require_parens(_Precedence.NAMED_EXPR, node):
+ self.set_precedence(_Precedence.ATOM, node.target, node.value)
+ self.traverse(node.target)
+ self.write(" := ")
+ self.traverse(node.value)
+
+ def visit_Import(self, node):
+ self.fill("import ")
+ self.interleave(lambda: self.write(", "), self.traverse, node.names)
+
+ def visit_ImportFrom(self, node):
+ self.fill("from ")
+ self.write("." * (node.level or 0))
+ if node.module:
+ self.write(node.module)
+ self.write(" import ")
+ self.interleave(lambda: self.write(", "), self.traverse, node.names)
+
+ def visit_Assign(self, node):
+ self.fill()
+ for target in node.targets:
+ self.set_precedence(_Precedence.TUPLE, target)
+ self.traverse(target)
+ self.write(" = ")
+ self.traverse(node.value)
+ if type_comment := self.get_type_comment(node):
+ self.write(type_comment)
+
+ def visit_AugAssign(self, node):
+ self.fill()
+ self.traverse(node.target)
+ self.write(" " + self.binop[node.op.__class__.__name__] + "= ")
+ self.traverse(node.value)
+
+ def visit_AnnAssign(self, node):
+ self.fill()
+ with self.delimit_if("(", ")", not node.simple and isinstance(node.target, Name)):
+ self.traverse(node.target)
+ self.write(": ")
+ self.traverse(node.annotation)
+ if node.value:
+ self.write(" = ")
+ self.traverse(node.value)
+
+ def visit_Return(self, node):
+ self.fill("return")
+ if node.value:
+ self.write(" ")
+ self.traverse(node.value)
+
+ def visit_Pass(self, node):
+ self.fill("pass")
+
+ def visit_Break(self, node):
+ self.fill("break")
+
+ def visit_Continue(self, node):
+ self.fill("continue")
+
+ def visit_Delete(self, node):
+ self.fill("del ")
+ self.interleave(lambda: self.write(", "), self.traverse, node.targets)
+
+ def visit_Assert(self, node):
+ self.fill("assert ")
+ self.traverse(node.test)
+ if node.msg:
+ self.write(", ")
+ self.traverse(node.msg)
+
+ def visit_Global(self, node):
+ self.fill("global ")
+ self.interleave(lambda: self.write(", "), self.write, node.names)
+
+ def visit_Nonlocal(self, node):
+ self.fill("nonlocal ")
+ self.interleave(lambda: self.write(", "), self.write, node.names)
+
+ def visit_Await(self, node):
+ with self.require_parens(_Precedence.AWAIT, node):
+ self.write("await")
+ if node.value:
+ self.write(" ")
+ self.set_precedence(_Precedence.ATOM, node.value)
+ self.traverse(node.value)
+
+ def visit_Yield(self, node):
+ with self.require_parens(_Precedence.YIELD, node):
+ self.write("yield")
+ if node.value:
+ self.write(" ")
+ self.set_precedence(_Precedence.ATOM, node.value)
+ self.traverse(node.value)
+
+ def visit_YieldFrom(self, node):
+ with self.require_parens(_Precedence.YIELD, node):
+ self.write("yield from ")
+ if not node.value:
+ raise ValueError("Node can't be used without a value attribute.")
+ self.set_precedence(_Precedence.ATOM, node.value)
+ self.traverse(node.value)
+
+ def visit_Raise(self, node):
+ self.fill("raise")
+ if not node.exc:
+ if node.cause:
+ raise ValueError(f"Node can't use cause without an exception.")
+ return
+ self.write(" ")
+ self.traverse(node.exc)
+ if node.cause:
+ self.write(" from ")
+ self.traverse(node.cause)
+
+ def do_visit_try(self, node):
+ self.fill("try", allow_semicolon=False)
+ with self.block():
+ self.traverse(node.body)
+ for ex in node.handlers:
+ self.traverse(ex)
+ if node.orelse:
+ self.fill("else", allow_semicolon=False)
+ with self.block():
+ self.traverse(node.orelse)
+ if node.finalbody:
+ self.fill("finally", allow_semicolon=False)
+ with self.block():
+ self.traverse(node.finalbody)
+
+ def visit_Try(self, node):
+ prev_in_try_star = self._in_try_star
+ try:
+ self._in_try_star = False
+ self.do_visit_try(node)
+ finally:
+ self._in_try_star = prev_in_try_star
+
+ def visit_TryStar(self, node):
+ prev_in_try_star = self._in_try_star
+ try:
+ self._in_try_star = True
+ self.do_visit_try(node)
+ finally:
+ self._in_try_star = prev_in_try_star
+
+ def visit_ExceptHandler(self, node):
+ self.fill("except*" if self._in_try_star else "except", allow_semicolon=False)
+ if node.type:
+ self.write(" ")
+ self.traverse(node.type)
+ if node.name:
+ self.write(" as ")
+ self.write(node.name)
+ with self.block():
+ self.traverse(node.body)
+
+ def visit_ClassDef(self, node):
+ self.maybe_newline()
+ for deco in node.decorator_list:
+ self.fill("@", allow_semicolon=False)
+ self.traverse(deco)
+ self.fill("class " + node.name, allow_semicolon=False)
+ if hasattr(node, "type_params"):
+ self._type_params_helper(node.type_params)
+ with self.delimit_if("(", ")", condition = node.bases or node.keywords):
+ comma = False
+ for e in node.bases:
+ if comma:
+ self.write(", ")
+ else:
+ comma = True
+ self.traverse(e)
+ for e in node.keywords:
+ if comma:
+ self.write(", ")
+ else:
+ comma = True
+ self.traverse(e)
+
+ with self.block():
+ self._write_docstring_and_traverse_body(node)
+
+ def visit_FunctionDef(self, node):
+ self._function_helper(node, "def")
+
+ def visit_AsyncFunctionDef(self, node):
+ self._function_helper(node, "async def")
+
+ def _function_helper(self, node, fill_suffix):
+ self.maybe_newline()
+ for deco in node.decorator_list:
+ self.fill("@", allow_semicolon=False)
+ self.traverse(deco)
+ def_str = fill_suffix + " " + node.name
+ self.fill(def_str, allow_semicolon=False)
+ if hasattr(node, "type_params"):
+ self._type_params_helper(node.type_params)
+ with self.delimit("(", ")"):
+ self.traverse(node.args)
+ if node.returns:
+ self.write(" -> ")
+ self.traverse(node.returns)
+ with self.block(extra=self.get_type_comment(node)):
+ self._write_docstring_and_traverse_body(node)
+
+ def _type_params_helper(self, type_params):
+ if type_params is not None and len(type_params) > 0:
+ with self.delimit("[", "]"):
+ self.interleave(lambda: self.write(", "), self.traverse, type_params)
+
+ def visit_TypeVar(self, node):
+ self.write(node.name)
+ if node.bound:
+ self.write(": ")
+ self.traverse(node.bound)
+ if node.default_value:
+ self.write(" = ")
+ self.traverse(node.default_value)
+
+ def visit_TypeVarTuple(self, node):
+ self.write("*" + node.name)
+ if node.default_value:
+ self.write(" = ")
+ self.traverse(node.default_value)
+
+ def visit_ParamSpec(self, node):
+ self.write("**" + node.name)
+ if node.default_value:
+ self.write(" = ")
+ self.traverse(node.default_value)
+
+ def visit_TypeAlias(self, node):
+ self.fill("type ")
+ self.traverse(node.name)
+ self._type_params_helper(node.type_params)
+ self.write(" = ")
+ self.traverse(node.value)
+
+ def visit_For(self, node):
+ self._for_helper("for ", node)
+
+ def visit_AsyncFor(self, node):
+ self._for_helper("async for ", node)
+
+ def _for_helper(self, fill, node):
+ self.fill(fill, allow_semicolon=False)
+ self.set_precedence(_Precedence.TUPLE, node.target)
+ self.traverse(node.target)
+ self.write(" in ")
+ self.traverse(node.iter)
+ with self.block(extra=self.get_type_comment(node)):
+ self.traverse(node.body)
+ if node.orelse:
+ self.fill("else", allow_semicolon=False)
+ with self.block():
+ self.traverse(node.orelse)
+
+ def visit_If(self, node):
+ self.fill("if ", allow_semicolon=False)
+ self.traverse(node.test)
+ with self.block():
+ self.traverse(node.body)
+ # collapse nested ifs into equivalent elifs.
+ while node.orelse and len(node.orelse) == 1 and isinstance(node.orelse[0], If):
+ node = node.orelse[0]
+ self.fill("elif ", allow_semicolon=False)
+ self.traverse(node.test)
+ with self.block():
+ self.traverse(node.body)
+ # final else
+ if node.orelse:
+ self.fill("else", allow_semicolon=False)
+ with self.block():
+ self.traverse(node.orelse)
+
+ def visit_While(self, node):
+ self.fill("while ", allow_semicolon=False)
+ self.traverse(node.test)
+ with self.block():
+ self.traverse(node.body)
+ if node.orelse:
+ self.fill("else", allow_semicolon=False)
+ with self.block():
+ self.traverse(node.orelse)
+
+ def visit_With(self, node):
+ self.fill("with ", allow_semicolon=False)
+ self.interleave(lambda: self.write(", "), self.traverse, node.items)
+ with self.block(extra=self.get_type_comment(node)):
+ self.traverse(node.body)
+
+ def visit_AsyncWith(self, node):
+ self.fill("async with ", allow_semicolon=False)
+ self.interleave(lambda: self.write(", "), self.traverse, node.items)
+ with self.block(extra=self.get_type_comment(node)):
+ self.traverse(node.body)
+
+ def _str_literal_helper(
+ self, string, *, quote_types=_ALL_QUOTES, escape_special_whitespace=False
+ ):
+ """Helper for writing string literals, minimizing escapes.
+ Returns the tuple (string literal to write, possible quote types).
+ """
+ def escape_char(c):
+ # \n and \t are non-printable, but we only escape them if
+ # escape_special_whitespace is True
+ if not escape_special_whitespace and c in "\n\t":
+ return c
+ # Always escape backslashes and other non-printable characters
+ if c == "\\" or not c.isprintable():
+ return c.encode("unicode_escape").decode("ascii")
+ return c
+
+ escaped_string = "".join(map(escape_char, string))
+ possible_quotes = quote_types
+ if "\n" in escaped_string:
+ possible_quotes = [q for q in possible_quotes if q in _MULTI_QUOTES]
+ possible_quotes = [q for q in possible_quotes if q not in escaped_string]
+ if not possible_quotes:
+ # If there aren't any possible_quotes, fallback to using repr
+ # on the original string. Try to use a quote from quote_types,
+ # e.g., so that we use triple quotes for docstrings.
+ string = repr(string)
+ quote = next((q for q in quote_types if string[0] in q), string[0])
+ return string[1:-1], [quote]
+ if escaped_string:
+ # Sort so that we prefer '''"''' over """\""""
+ possible_quotes.sort(key=lambda q: q[0] == escaped_string[-1])
+ # If we're using triple quotes and we'd need to escape a final
+ # quote, escape it
+ if possible_quotes[0][0] == escaped_string[-1]:
+ assert len(possible_quotes[0]) == 3
+ escaped_string = escaped_string[:-1] + "\\" + escaped_string[-1]
+ return escaped_string, possible_quotes
+
+ def _write_str_avoiding_backslashes(self, string, *, quote_types=_ALL_QUOTES):
+ """Write string literal value with a best effort attempt to avoid backslashes."""
+ string, quote_types = self._str_literal_helper(string, quote_types=quote_types)
+ quote_type = quote_types[0]
+ self.write(f"{quote_type}{string}{quote_type}")
+
+ def _ftstring_helper(self, parts):
+ new_parts = []
+ quote_types = list(_ALL_QUOTES)
+ fallback_to_repr = False
+ for value, is_constant in parts:
+ if is_constant:
+ value, new_quote_types = self._str_literal_helper(
+ value,
+ quote_types=quote_types,
+ escape_special_whitespace=True,
+ )
+ if set(new_quote_types).isdisjoint(quote_types):
+ fallback_to_repr = True
+ break
+ quote_types = new_quote_types
+ else:
+ if "\n" in value:
+ quote_types = [q for q in quote_types if q in _MULTI_QUOTES]
+ assert quote_types
+
+ new_quote_types = [q for q in quote_types if q not in value]
+ if new_quote_types:
+ quote_types = new_quote_types
+ new_parts.append(value)
+
+ if fallback_to_repr:
+ # If we weren't able to find a quote type that works for all parts
+ # of the JoinedStr, fallback to using repr and triple single quotes.
+ quote_types = ["'''"]
+ new_parts.clear()
+ for value, is_constant in parts:
+ if is_constant:
+ value = repr('"' + value) # force repr to use single quotes
+ expected_prefix = "'\""
+ assert value.startswith(expected_prefix), repr(value)
+ value = value[len(expected_prefix):-1]
+ new_parts.append(value)
+
+ value = "".join(new_parts)
+ quote_type = quote_types[0]
+ self.write(f"{quote_type}{value}{quote_type}")
+
+ def _write_ftstring(self, values, prefix):
+ self.write(prefix)
+ fstring_parts = []
+ for value in values:
+ with self.buffered() as buffer:
+ self._write_ftstring_inner(value)
+ fstring_parts.append(
+ ("".join(buffer), isinstance(value, Constant))
+ )
+ self._ftstring_helper(fstring_parts)
+
+ def visit_JoinedStr(self, node):
+ self._write_ftstring(node.values, "f")
+
+ def visit_TemplateStr(self, node):
+ self._write_ftstring(node.values, "t")
+
+ def _write_ftstring_inner(self, node, is_format_spec=False):
+ if isinstance(node, JoinedStr):
+ # for both the f-string itself, and format_spec
+ for value in node.values:
+ self._write_ftstring_inner(value, is_format_spec=is_format_spec)
+ elif isinstance(node, Constant) and isinstance(node.value, str):
+ value = node.value.replace("{", "{{").replace("}", "}}")
+
+ if is_format_spec:
+ value = value.replace("\\", "\\\\")
+ value = value.replace("'", "\\'")
+ value = value.replace('"', '\\"')
+ value = value.replace("\n", "\\n")
+ self.write(value)
+ elif isinstance(node, FormattedValue):
+ self.visit_FormattedValue(node)
+ elif isinstance(node, Interpolation):
+ self.visit_Interpolation(node)
+ else:
+ raise ValueError(f"Unexpected node inside JoinedStr, {node!r}")
+
+ def _unparse_interpolation_value(self, inner):
+ unparser = type(self)()
+ unparser.set_precedence(_Precedence.TEST.next(), inner)
+ return unparser.visit(inner)
+
+ def _write_interpolation(self, node, use_str_attr=False):
+ with self.delimit("{", "}"):
+ if use_str_attr:
+ expr = node.str
+ else:
+ expr = self._unparse_interpolation_value(node.value)
+ if expr.startswith("{"):
+ # Separate pair of opening brackets as "{ {"
+ self.write(" ")
+ self.write(expr)
+ if node.conversion != -1:
+ self.write(f"!{chr(node.conversion)}")
+ if node.format_spec:
+ self.write(":")
+ self._write_ftstring_inner(node.format_spec, is_format_spec=True)
+
+ def visit_FormattedValue(self, node):
+ self._write_interpolation(node)
+
+ def visit_Interpolation(self, node):
+ # If `str` is set to `None`, use the `value` to generate the source code.
+ self._write_interpolation(node, use_str_attr=node.str is not None)
+
+ def visit_Name(self, node):
+ self.write(node.id)
+
+ def _write_docstring(self, node):
+ self.fill(allow_semicolon=False)
+ if node.kind == "u":
+ self.write("u")
+ self._write_str_avoiding_backslashes(node.value, quote_types=_MULTI_QUOTES)
+
+ def _write_constant(self, value):
+ if isinstance(value, (float, complex)):
+ # Substitute overflowing decimal literal for AST infinities,
+ # and inf - inf for NaNs.
+ self.write(
+ repr(value)
+ .replace("inf", _INFSTR)
+ .replace("nan", f"({_INFSTR}-{_INFSTR})")
+ )
+ else:
+ self.write(repr(value))
+
+ def visit_Constant(self, node):
+ value = node.value
+ if isinstance(value, tuple):
+ with self.delimit("(", ")"):
+ self.items_view(self._write_constant, value)
+ elif value is ...:
+ self.write("...")
+ else:
+ if node.kind == "u":
+ self.write("u")
+ self._write_constant(node.value)
+
+ def visit_List(self, node):
+ with self.delimit("[", "]"):
+ self.interleave(lambda: self.write(", "), self.traverse, node.elts)
+
+ def visit_ListComp(self, node):
+ with self.delimit("[", "]"):
+ self.traverse(node.elt)
+ for gen in node.generators:
+ self.traverse(gen)
+
+ def visit_GeneratorExp(self, node):
+ with self.delimit("(", ")"):
+ self.traverse(node.elt)
+ for gen in node.generators:
+ self.traverse(gen)
+
+ def visit_SetComp(self, node):
+ with self.delimit("{", "}"):
+ self.traverse(node.elt)
+ for gen in node.generators:
+ self.traverse(gen)
+
+ def visit_DictComp(self, node):
+ with self.delimit("{", "}"):
+ self.traverse(node.key)
+ self.write(": ")
+ self.traverse(node.value)
+ for gen in node.generators:
+ self.traverse(gen)
+
+ def visit_comprehension(self, node):
+ if node.is_async:
+ self.write(" async for ")
+ else:
+ self.write(" for ")
+ self.set_precedence(_Precedence.TUPLE, node.target)
+ self.traverse(node.target)
+ self.write(" in ")
+ self.set_precedence(_Precedence.TEST.next(), node.iter, *node.ifs)
+ self.traverse(node.iter)
+ for if_clause in node.ifs:
+ self.write(" if ")
+ self.traverse(if_clause)
+
+ def visit_IfExp(self, node):
+ with self.require_parens(_Precedence.TEST, node):
+ self.set_precedence(_Precedence.TEST.next(), node.body, node.test)
+ self.traverse(node.body)
+ self.write(" if ")
+ self.traverse(node.test)
+ self.write(" else ")
+ self.set_precedence(_Precedence.TEST, node.orelse)
+ self.traverse(node.orelse)
+
+ def visit_Set(self, node):
+ if node.elts:
+ with self.delimit("{", "}"):
+ self.interleave(lambda: self.write(", "), self.traverse, node.elts)
+ else:
+ # `{}` would be interpreted as a dictionary literal, and
+ # `set` might be shadowed. Thus:
+ self.write('{*()}')
+
+ def visit_Dict(self, node):
+ def write_key_value_pair(k, v):
+ self.traverse(k)
+ self.write(": ")
+ self.traverse(v)
+
+ def write_item(item):
+ k, v = item
+ if k is None:
+ # for dictionary unpacking operator in dicts {**{'y': 2}}
+ # see PEP 448 for details
+ self.write("**")
+ self.set_precedence(_Precedence.EXPR, v)
+ self.traverse(v)
+ else:
+ write_key_value_pair(k, v)
+
+ with self.delimit("{", "}"):
+ self.interleave(
+ lambda: self.write(", "), write_item, zip(node.keys, node.values)
+ )
+
+ def visit_Tuple(self, node):
+ with self.delimit_if(
+ "(",
+ ")",
+ len(node.elts) == 0 or self.get_precedence(node) > _Precedence.TUPLE
+ ):
+ self.items_view(self.traverse, node.elts)
+
+ unop = {"Invert": "~", "Not": "not", "UAdd": "+", "USub": "-"}
+ unop_precedence = {
+ "not": _Precedence.NOT,
+ "~": _Precedence.FACTOR,
+ "+": _Precedence.FACTOR,
+ "-": _Precedence.FACTOR,
+ }
+
+ def visit_UnaryOp(self, node):
+ operator = self.unop[node.op.__class__.__name__]
+ operator_precedence = self.unop_precedence[operator]
+ with self.require_parens(operator_precedence, node):
+ self.write(operator)
+ # factor prefixes (+, -, ~) shouldn't be separated
+ # from the value they belong, (e.g: +1 instead of + 1)
+ if operator_precedence is not _Precedence.FACTOR:
+ self.write(" ")
+ self.set_precedence(operator_precedence, node.operand)
+ self.traverse(node.operand)
+
+ binop = {
+ "Add": "+",
+ "Sub": "-",
+ "Mult": "*",
+ "MatMult": "@",
+ "Div": "/",
+ "Mod": "%",
+ "LShift": "<<",
+ "RShift": ">>",
+ "BitOr": "|",
+ "BitXor": "^",
+ "BitAnd": "&",
+ "FloorDiv": "//",
+ "Pow": "**",
+ }
+
+ binop_precedence = {
+ "+": _Precedence.ARITH,
+ "-": _Precedence.ARITH,
+ "*": _Precedence.TERM,
+ "@": _Precedence.TERM,
+ "/": _Precedence.TERM,
+ "%": _Precedence.TERM,
+ "<<": _Precedence.SHIFT,
+ ">>": _Precedence.SHIFT,
+ "|": _Precedence.BOR,
+ "^": _Precedence.BXOR,
+ "&": _Precedence.BAND,
+ "//": _Precedence.TERM,
+ "**": _Precedence.POWER,
+ }
+
+ binop_rassoc = frozenset(("**",))
+ def visit_BinOp(self, node):
+ operator = self.binop[node.op.__class__.__name__]
+ operator_precedence = self.binop_precedence[operator]
+ with self.require_parens(operator_precedence, node):
+ if operator in self.binop_rassoc:
+ left_precedence = operator_precedence.next()
+ right_precedence = operator_precedence
+ else:
+ left_precedence = operator_precedence
+ right_precedence = operator_precedence.next()
+
+ self.set_precedence(left_precedence, node.left)
+ self.traverse(node.left)
+ self.write(f" {operator} ")
+ self.set_precedence(right_precedence, node.right)
+ self.traverse(node.right)
+
+ cmpops = {
+ "Eq": "==",
+ "NotEq": "!=",
+ "Lt": "<",
+ "LtE": "<=",
+ "Gt": ">",
+ "GtE": ">=",
+ "Is": "is",
+ "IsNot": "is not",
+ "In": "in",
+ "NotIn": "not in",
+ }
+
+ def visit_Compare(self, node):
+ with self.require_parens(_Precedence.CMP, node):
+ self.set_precedence(_Precedence.CMP.next(), node.left, *node.comparators)
+ self.traverse(node.left)
+ for o, e in zip(node.ops, node.comparators):
+ self.write(" " + self.cmpops[o.__class__.__name__] + " ")
+ self.traverse(e)
+
+ boolops = {"And": "and", "Or": "or"}
+ boolop_precedence = {"and": _Precedence.AND, "or": _Precedence.OR}
+
+ def visit_BoolOp(self, node):
+ operator = self.boolops[node.op.__class__.__name__]
+ operator_precedence = self.boolop_precedence[operator]
+
+ def increasing_level_traverse(node):
+ nonlocal operator_precedence
+ operator_precedence = operator_precedence.next()
+ self.set_precedence(operator_precedence, node)
+ self.traverse(node)
+
+ with self.require_parens(operator_precedence, node):
+ s = f" {operator} "
+ self.interleave(lambda: self.write(s), increasing_level_traverse, node.values)
+
+ def visit_Attribute(self, node):
+ self.set_precedence(_Precedence.ATOM, node.value)
+ self.traverse(node.value)
+ # Special case: 3.__abs__() is a syntax error, so if node.value
+ # is an integer literal then we need to either parenthesize
+ # it or add an extra space to get 3 .__abs__().
+ if isinstance(node.value, Constant) and isinstance(node.value.value, int):
+ self.write(" ")
+ self.write(".")
+ self.write(node.attr)
+
+ def visit_Call(self, node):
+ self.set_precedence(_Precedence.ATOM, node.func)
+ self.traverse(node.func)
+ with self.delimit("(", ")"):
+ comma = False
+ for e in node.args:
+ if comma:
+ self.write(", ")
+ else:
+ comma = True
+ self.traverse(e)
+ for e in node.keywords:
+ if comma:
+ self.write(", ")
+ else:
+ comma = True
+ self.traverse(e)
+
+ def visit_Subscript(self, node):
+ def is_non_empty_tuple(slice_value):
+ return (
+ isinstance(slice_value, Tuple)
+ and slice_value.elts
+ )
+
+ self.set_precedence(_Precedence.ATOM, node.value)
+ self.traverse(node.value)
+ with self.delimit("[", "]"):
+ if is_non_empty_tuple(node.slice):
+ # parentheses can be omitted if the tuple isn't empty
+ self.items_view(self.traverse, node.slice.elts)
+ else:
+ self.traverse(node.slice)
+
+ def visit_Starred(self, node):
+ self.write("*")
+ self.set_precedence(_Precedence.EXPR, node.value)
+ self.traverse(node.value)
+
+ def visit_Ellipsis(self, node):
+ self.write("...")
+
+ def visit_Slice(self, node):
+ if node.lower:
+ self.traverse(node.lower)
+ self.write(":")
+ if node.upper:
+ self.traverse(node.upper)
+ if node.step:
+ self.write(":")
+ self.traverse(node.step)
+
+ def visit_Match(self, node):
+ self.fill("match ", allow_semicolon=False)
+ self.traverse(node.subject)
+ with self.block():
+ for case in node.cases:
+ self.traverse(case)
+
+ def visit_arg(self, node):
+ self.write(node.arg)
+ if node.annotation:
+ self.write(": ")
+ self.traverse(node.annotation)
+
+ def visit_arguments(self, node):
+ first = True
+ # normal arguments
+ all_args = node.posonlyargs + node.args
+ defaults = [None] * (len(all_args) - len(node.defaults)) + node.defaults
+ for index, elements in enumerate(zip(all_args, defaults), 1):
+ a, d = elements
+ if first:
+ first = False
+ else:
+ self.write(", ")
+ self.traverse(a)
+ if d:
+ self.write("=")
+ self.traverse(d)
+ if index == len(node.posonlyargs):
+ self.write(", /")
+
+ # varargs, or bare '*' if no varargs but keyword-only arguments present
+ if node.vararg or node.kwonlyargs:
+ if first:
+ first = False
+ else:
+ self.write(", ")
+ self.write("*")
+ if node.vararg:
+ self.write(node.vararg.arg)
+ if node.vararg.annotation:
+ self.write(": ")
+ self.traverse(node.vararg.annotation)
+
+ # keyword-only arguments
+ if node.kwonlyargs:
+ for a, d in zip(node.kwonlyargs, node.kw_defaults):
+ self.write(", ")
+ self.traverse(a)
+ if d:
+ self.write("=")
+ self.traverse(d)
+
+ # kwargs
+ if node.kwarg:
+ if first:
+ first = False
+ else:
+ self.write(", ")
+ self.write("**" + node.kwarg.arg)
+ if node.kwarg.annotation:
+ self.write(": ")
+ self.traverse(node.kwarg.annotation)
+
+ def visit_keyword(self, node):
+ if node.arg is None:
+ self.write("**")
+ else:
+ self.write(node.arg)
+ self.write("=")
+ self.traverse(node.value)
+
+ def visit_Lambda(self, node):
+ with self.require_parens(_Precedence.TEST, node):
+ self.write("lambda")
+ with self.buffered() as buffer:
+ self.traverse(node.args)
+ if buffer:
+ self.write(" ", *buffer)
+ self.write(": ")
+ self.set_precedence(_Precedence.TEST, node.body)
+ self.traverse(node.body)
+
+ def visit_alias(self, node):
+ self.write(node.name)
+ if node.asname:
+ self.write(" as " + node.asname)
+
+ def visit_withitem(self, node):
+ self.traverse(node.context_expr)
+ if node.optional_vars:
+ self.write(" as ")
+ self.traverse(node.optional_vars)
+
+ def visit_match_case(self, node):
+ self.fill("case ", allow_semicolon=False)
+ self.traverse(node.pattern)
+ if node.guard:
+ self.write(" if ")
+ self.traverse(node.guard)
+ with self.block():
+ self.traverse(node.body)
+
+ def visit_MatchValue(self, node):
+ self.traverse(node.value)
+
+ def visit_MatchSingleton(self, node):
+ self._write_constant(node.value)
+
+ def visit_MatchSequence(self, node):
+ with self.delimit("[", "]"):
+ self.interleave(
+ lambda: self.write(", "), self.traverse, node.patterns
+ )
+
+ def visit_MatchStar(self, node):
+ name = node.name
+ if name is None:
+ name = "_"
+ self.write(f"*{name}")
+
+ def visit_MatchMapping(self, node):
+ def write_key_pattern_pair(pair):
+ k, p = pair
+ self.traverse(k)
+ self.write(": ")
+ self.traverse(p)
+
+ with self.delimit("{", "}"):
+ keys = node.keys
+ self.interleave(
+ lambda: self.write(", "),
+ write_key_pattern_pair,
+ zip(keys, node.patterns, strict=True),
+ )
+ rest = node.rest
+ if rest is not None:
+ if keys:
+ self.write(", ")
+ self.write(f"**{rest}")
+
+ def visit_MatchClass(self, node):
+ self.set_precedence(_Precedence.ATOM, node.cls)
+ self.traverse(node.cls)
+ with self.delimit("(", ")"):
+ patterns = node.patterns
+ self.interleave(
+ lambda: self.write(", "), self.traverse, patterns
+ )
+ attrs = node.kwd_attrs
+ if attrs:
+ def write_attr_pattern(pair):
+ attr, pattern = pair
+ self.write(f"{attr}=")
+ self.traverse(pattern)
+
+ if patterns:
+ self.write(", ")
+ self.interleave(
+ lambda: self.write(", "),
+ write_attr_pattern,
+ zip(attrs, node.kwd_patterns, strict=True),
+ )
+
+ def visit_MatchAs(self, node):
+ name = node.name
+ pattern = node.pattern
+ if name is None:
+ self.write("_")
+ elif pattern is None:
+ self.write(node.name)
+ else:
+ with self.require_parens(_Precedence.TEST, node):
+ self.set_precedence(_Precedence.BOR, node.pattern)
+ self.traverse(node.pattern)
+ self.write(f" as {node.name}")
+
+ def visit_MatchOr(self, node):
+ with self.require_parens(_Precedence.BOR, node):
+ self.set_precedence(_Precedence.BOR.next(), *node.patterns)
+ self.interleave(lambda: self.write(" | "), self.traverse, node.patterns)
diff --git a/python/python3_13/examples/_collections_abc.py b/python/python3_14/examples/_collections_abc.py
similarity index 97%
rename from python/python3_13/examples/_collections_abc.py
rename to python/python3_14/examples/_collections_abc.py
index aebe9c8b64..241d40d574 100644
--- a/python/python3_13/examples/_collections_abc.py
+++ b/python/python3_14/examples/_collections_abc.py
@@ -485,9 +485,10 @@ def __new__(cls, origin, args):
def __repr__(self):
if len(self.__args__) == 2 and _is_param_expr(self.__args__[0]):
return super().__repr__()
+ from annotationlib import type_repr
return (f'collections.abc.Callable'
- f'[[{", ".join([_type_repr(a) for a in self.__args__[:-1]])}], '
- f'{_type_repr(self.__args__[-1])}]')
+ f'[[{", ".join([type_repr(a) for a in self.__args__[:-1]])}], '
+ f'{type_repr(self.__args__[-1])}]')
def __reduce__(self):
args = self.__args__
@@ -524,23 +525,6 @@ def _is_param_expr(obj):
names = ('ParamSpec', '_ConcatenateGenericAlias')
return obj.__module__ == 'typing' and any(obj.__name__ == name for name in names)
-def _type_repr(obj):
- """Return the repr() of an object, special-casing types (internal helper).
-
- Copied from :mod:`typing` since collections.abc
- shouldn't depend on that module.
- (Keep this roughly in sync with the typing version.)
- """
- if isinstance(obj, type):
- if obj.__module__ == 'builtins':
- return obj.__qualname__
- return f'{obj.__module__}.{obj.__qualname__}'
- if obj is Ellipsis:
- return '...'
- if isinstance(obj, FunctionType):
- return obj.__name__
- return repr(obj)
-
class Callable(metaclass=ABCMeta):
@@ -1073,6 +1057,7 @@ def count(self, value):
Sequence.register(tuple)
Sequence.register(str)
+Sequence.register(bytes)
Sequence.register(range)
Sequence.register(memoryview)
@@ -1083,7 +1068,7 @@ def __new__(cls, name, bases, namespace, **kwargs):
warnings._deprecated(
"collections.abc.ByteString",
- remove=(3, 14),
+ remove=(3, 17),
)
return super().__new__(cls, name, bases, namespace, **kwargs)
@@ -1092,14 +1077,18 @@ def __instancecheck__(cls, instance):
warnings._deprecated(
"collections.abc.ByteString",
- remove=(3, 14),
+ remove=(3, 17),
)
return super().__instancecheck__(instance)
class ByteString(Sequence, metaclass=_DeprecateByteStringMeta):
- """This unifies bytes and bytearray.
+ """Deprecated ABC serving as a common supertype of ``bytes`` and ``bytearray``.
- XXX Should add all their methods.
+ This ABC is scheduled for removal in Python 3.17.
+ Use ``isinstance(obj, collections.abc.Buffer)`` to test if ``obj``
+ implements the buffer protocol at runtime. For use in type annotations,
+ either use ``Buffer`` or a union that explicitly specifies the types your
+ code supports (e.g., ``bytes | bytearray | memoryview``).
"""
__slots__ = ()
@@ -1175,4 +1164,4 @@ def __iadd__(self, values):
MutableSequence.register(list)
-MutableSequence.register(bytearray) # Multiply inheriting, see ByteString
+MutableSequence.register(bytearray)
diff --git a/python/python3_14/examples/_colorize.py b/python/python3_14/examples/_colorize.py
new file mode 100644
index 0000000000..d6673f6692
--- /dev/null
+++ b/python/python3_14/examples/_colorize.py
@@ -0,0 +1,355 @@
+import os
+import sys
+
+from collections.abc import Callable, Iterator, Mapping
+from dataclasses import dataclass, field, Field
+
+COLORIZE = True
+
+
+# types
+if False:
+ from typing import IO, Self, ClassVar
+ _theme: Theme
+
+
+class ANSIColors:
+ RESET = "\x1b[0m"
+
+ BLACK = "\x1b[30m"
+ BLUE = "\x1b[34m"
+ CYAN = "\x1b[36m"
+ GREEN = "\x1b[32m"
+ GREY = "\x1b[90m"
+ MAGENTA = "\x1b[35m"
+ RED = "\x1b[31m"
+ WHITE = "\x1b[37m" # more like LIGHT GRAY
+ YELLOW = "\x1b[33m"
+
+ BOLD = "\x1b[1m"
+ BOLD_BLACK = "\x1b[1;30m" # DARK GRAY
+ BOLD_BLUE = "\x1b[1;34m"
+ BOLD_CYAN = "\x1b[1;36m"
+ BOLD_GREEN = "\x1b[1;32m"
+ BOLD_MAGENTA = "\x1b[1;35m"
+ BOLD_RED = "\x1b[1;31m"
+ BOLD_WHITE = "\x1b[1;37m" # actual WHITE
+ BOLD_YELLOW = "\x1b[1;33m"
+
+ # intense = like bold but without being bold
+ INTENSE_BLACK = "\x1b[90m"
+ INTENSE_BLUE = "\x1b[94m"
+ INTENSE_CYAN = "\x1b[96m"
+ INTENSE_GREEN = "\x1b[92m"
+ INTENSE_MAGENTA = "\x1b[95m"
+ INTENSE_RED = "\x1b[91m"
+ INTENSE_WHITE = "\x1b[97m"
+ INTENSE_YELLOW = "\x1b[93m"
+
+ BACKGROUND_BLACK = "\x1b[40m"
+ BACKGROUND_BLUE = "\x1b[44m"
+ BACKGROUND_CYAN = "\x1b[46m"
+ BACKGROUND_GREEN = "\x1b[42m"
+ BACKGROUND_MAGENTA = "\x1b[45m"
+ BACKGROUND_RED = "\x1b[41m"
+ BACKGROUND_WHITE = "\x1b[47m"
+ BACKGROUND_YELLOW = "\x1b[43m"
+
+ INTENSE_BACKGROUND_BLACK = "\x1b[100m"
+ INTENSE_BACKGROUND_BLUE = "\x1b[104m"
+ INTENSE_BACKGROUND_CYAN = "\x1b[106m"
+ INTENSE_BACKGROUND_GREEN = "\x1b[102m"
+ INTENSE_BACKGROUND_MAGENTA = "\x1b[105m"
+ INTENSE_BACKGROUND_RED = "\x1b[101m"
+ INTENSE_BACKGROUND_WHITE = "\x1b[107m"
+ INTENSE_BACKGROUND_YELLOW = "\x1b[103m"
+
+
+ColorCodes = set()
+NoColors = ANSIColors()
+
+for attr, code in ANSIColors.__dict__.items():
+ if not attr.startswith("__"):
+ ColorCodes.add(code)
+ setattr(NoColors, attr, "")
+
+
+#
+# Experimental theming support (see gh-133346)
+#
+
+# - Create a theme by copying an existing `Theme` with one or more sections
+# replaced, using `default_theme.copy_with()`;
+# - create a theme section by copying an existing `ThemeSection` with one or
+# more colors replaced, using for example `default_theme.syntax.copy_with()`;
+# - create a theme from scratch by instantiating a `Theme` data class with
+# the required sections (which are also dataclass instances).
+#
+# Then call `_colorize.set_theme(your_theme)` to set it.
+#
+# Put your theme configuration in $PYTHONSTARTUP for the interactive shell,
+# or sitecustomize.py in your virtual environment or Python installation for
+# other uses. Your applications can call `_colorize.set_theme()` too.
+#
+# Note that thanks to the dataclasses providing default values for all fields,
+# creating a new theme or theme section from scratch is possible without
+# specifying all keys.
+#
+# For example, here's a theme that makes punctuation and operators less prominent:
+#
+# try:
+# from _colorize import set_theme, default_theme, Syntax, ANSIColors
+# except ImportError:
+# pass
+# else:
+# theme_with_dim_operators = default_theme.copy_with(
+# syntax=Syntax(op=ANSIColors.INTENSE_BLACK),
+# )
+# set_theme(theme_with_dim_operators)
+# del set_theme, default_theme, Syntax, ANSIColors, theme_with_dim_operators
+#
+# Guarding the import ensures that your .pythonstartup file will still work in
+# Python 3.13 and older. Deleting the variables ensures they don't remain in your
+# interactive shell's global scope.
+
+class ThemeSection(Mapping[str, str]):
+ """A mixin/base class for theme sections.
+
+ It enables dictionary access to a section, as well as implements convenience
+ methods.
+ """
+
+ # The two types below are just that: types to inform the type checker that the
+ # mixin will work in context of those fields existing
+ __dataclass_fields__: ClassVar[dict[str, Field[str]]]
+ _name_to_value: Callable[[str], str]
+
+ def __post_init__(self) -> None:
+ name_to_value = {}
+ for color_name in self.__dataclass_fields__:
+ name_to_value[color_name] = getattr(self, color_name)
+ super().__setattr__('_name_to_value', name_to_value.__getitem__)
+
+ def copy_with(self, **kwargs: str) -> Self:
+ color_state: dict[str, str] = {}
+ for color_name in self.__dataclass_fields__:
+ color_state[color_name] = getattr(self, color_name)
+ color_state.update(kwargs)
+ return type(self)(**color_state)
+
+ @classmethod
+ def no_colors(cls) -> Self:
+ color_state: dict[str, str] = {}
+ for color_name in cls.__dataclass_fields__:
+ color_state[color_name] = ""
+ return cls(**color_state)
+
+ def __getitem__(self, key: str) -> str:
+ return self._name_to_value(key)
+
+ def __len__(self) -> int:
+ return len(self.__dataclass_fields__)
+
+ def __iter__(self) -> Iterator[str]:
+ return iter(self.__dataclass_fields__)
+
+
+@dataclass(frozen=True, kw_only=True)
+class Argparse(ThemeSection):
+ usage: str = ANSIColors.BOLD_BLUE
+ prog: str = ANSIColors.BOLD_MAGENTA
+ prog_extra: str = ANSIColors.MAGENTA
+ heading: str = ANSIColors.BOLD_BLUE
+ summary_long_option: str = ANSIColors.CYAN
+ summary_short_option: str = ANSIColors.GREEN
+ summary_label: str = ANSIColors.YELLOW
+ summary_action: str = ANSIColors.GREEN
+ long_option: str = ANSIColors.BOLD_CYAN
+ short_option: str = ANSIColors.BOLD_GREEN
+ label: str = ANSIColors.BOLD_YELLOW
+ action: str = ANSIColors.BOLD_GREEN
+ reset: str = ANSIColors.RESET
+
+
+@dataclass(frozen=True)
+class Syntax(ThemeSection):
+ prompt: str = ANSIColors.BOLD_MAGENTA
+ keyword: str = ANSIColors.BOLD_BLUE
+ keyword_constant: str = ANSIColors.BOLD_BLUE
+ builtin: str = ANSIColors.CYAN
+ comment: str = ANSIColors.RED
+ string: str = ANSIColors.GREEN
+ number: str = ANSIColors.YELLOW
+ op: str = ANSIColors.RESET
+ definition: str = ANSIColors.BOLD
+ soft_keyword: str = ANSIColors.BOLD_BLUE
+ reset: str = ANSIColors.RESET
+
+
+@dataclass(frozen=True)
+class Traceback(ThemeSection):
+ type: str = ANSIColors.BOLD_MAGENTA
+ message: str = ANSIColors.MAGENTA
+ filename: str = ANSIColors.MAGENTA
+ line_no: str = ANSIColors.MAGENTA
+ frame: str = ANSIColors.MAGENTA
+ error_highlight: str = ANSIColors.BOLD_RED
+ error_range: str = ANSIColors.RED
+ reset: str = ANSIColors.RESET
+
+
+@dataclass(frozen=True)
+class Unittest(ThemeSection):
+ passed: str = ANSIColors.GREEN
+ warn: str = ANSIColors.YELLOW
+ fail: str = ANSIColors.RED
+ fail_info: str = ANSIColors.BOLD_RED
+ reset: str = ANSIColors.RESET
+
+
+@dataclass(frozen=True)
+class Theme:
+ """A suite of themes for all sections of Python.
+
+ When adding a new one, remember to also modify `copy_with` and `no_colors`
+ below.
+ """
+ argparse: Argparse = field(default_factory=Argparse)
+ syntax: Syntax = field(default_factory=Syntax)
+ traceback: Traceback = field(default_factory=Traceback)
+ unittest: Unittest = field(default_factory=Unittest)
+
+ def copy_with(
+ self,
+ *,
+ argparse: Argparse | None = None,
+ syntax: Syntax | None = None,
+ traceback: Traceback | None = None,
+ unittest: Unittest | None = None,
+ ) -> Self:
+ """Return a new Theme based on this instance with some sections replaced.
+
+ Themes are immutable to protect against accidental modifications that
+ could lead to invalid terminal states.
+ """
+ return type(self)(
+ argparse=argparse or self.argparse,
+ syntax=syntax or self.syntax,
+ traceback=traceback or self.traceback,
+ unittest=unittest or self.unittest,
+ )
+
+ @classmethod
+ def no_colors(cls) -> Self:
+ """Return a new Theme where colors in all sections are empty strings.
+
+ This allows writing user code as if colors are always used. The color
+ fields will be ANSI color code strings when colorization is desired
+ and possible, and empty strings otherwise.
+ """
+ return cls(
+ argparse=Argparse.no_colors(),
+ syntax=Syntax.no_colors(),
+ traceback=Traceback.no_colors(),
+ unittest=Unittest.no_colors(),
+ )
+
+
+def get_colors(
+ colorize: bool = False, *, file: IO[str] | IO[bytes] | None = None
+) -> ANSIColors:
+ if colorize or can_colorize(file=file):
+ return ANSIColors()
+ else:
+ return NoColors
+
+
+def decolor(text: str) -> str:
+ """Remove ANSI color codes from a string."""
+ for code in ColorCodes:
+ text = text.replace(code, "")
+ return text
+
+
+def can_colorize(*, file: IO[str] | IO[bytes] | None = None) -> bool:
+
+ def _safe_getenv(k: str, fallback: str | None = None) -> str | None:
+ """Exception-safe environment retrieval. See gh-128636."""
+ try:
+ return os.environ.get(k, fallback)
+ except Exception:
+ return fallback
+
+ if file is None:
+ file = sys.stdout
+
+ if not sys.flags.ignore_environment:
+ if _safe_getenv("PYTHON_COLORS") == "0":
+ return False
+ if _safe_getenv("PYTHON_COLORS") == "1":
+ return True
+ if _safe_getenv("NO_COLOR"):
+ return False
+ if not COLORIZE:
+ return False
+ if _safe_getenv("FORCE_COLOR"):
+ return True
+ if _safe_getenv("TERM") == "dumb":
+ return False
+
+ if not hasattr(file, "fileno"):
+ return False
+
+ if sys.platform == "win32":
+ try:
+ import nt
+
+ if not nt._supports_virtual_terminal():
+ return False
+ except (ImportError, AttributeError):
+ return False
+
+ try:
+ return os.isatty(file.fileno())
+ except OSError:
+ return hasattr(file, "isatty") and file.isatty()
+
+
+default_theme = Theme()
+theme_no_color = default_theme.no_colors()
+
+
+def get_theme(
+ *,
+ tty_file: IO[str] | IO[bytes] | None = None,
+ force_color: bool = False,
+ force_no_color: bool = False,
+) -> Theme:
+ """Returns the currently set theme, potentially in a zero-color variant.
+
+ In cases where colorizing is not possible (see `can_colorize`), the returned
+ theme contains all empty strings in all color definitions.
+ See `Theme.no_colors()` for more information.
+
+ It is recommended not to cache the result of this function for extended
+ periods of time because the user might influence theme selection by
+ the interactive shell, a debugger, or application-specific code. The
+ environment (including environment variable state and console configuration
+ on Windows) can also change in the course of the application life cycle.
+ """
+ if force_color or (not force_no_color and
+ can_colorize(file=tty_file)):
+ return _theme
+ return theme_no_color
+
+
+def set_theme(t: Theme) -> None:
+ global _theme
+
+ if not isinstance(t, Theme):
+ raise ValueError(f"Expected Theme object, found {t}")
+
+ _theme = t
+
+
+set_theme(default_theme)
diff --git a/python/python3_13/examples/_compat_pickle.py b/python/python3_14/examples/_compat_pickle.py
similarity index 100%
rename from python/python3_13/examples/_compat_pickle.py
rename to python/python3_14/examples/_compat_pickle.py
diff --git a/python/python3_13/examples/_ios_support.py b/python/python3_14/examples/_ios_support.py
similarity index 100%
rename from python/python3_13/examples/_ios_support.py
rename to python/python3_14/examples/_ios_support.py
diff --git a/python/python3_13/examples/_markupbase.py b/python/python3_14/examples/_markupbase.py
similarity index 99%
rename from python/python3_13/examples/_markupbase.py
rename to python/python3_14/examples/_markupbase.py
index 3ad7e27996..614f0cd16d 100644
--- a/python/python3_13/examples/_markupbase.py
+++ b/python/python3_14/examples/_markupbase.py
@@ -13,7 +13,7 @@
_markedsectionclose = re.compile(r']\s*]\s*>')
# An analysis of the MS-Word extensions is available at
-# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
+# http://web.archive.org/web/20060321153828/http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
_msmarkedsectionclose = re.compile(r']\s*>')
diff --git a/python/python3_14/examples/_opcode_metadata.py b/python/python3_14/examples/_opcode_metadata.py
new file mode 100644
index 0000000000..b9304ec3c0
--- /dev/null
+++ b/python/python3_14/examples/_opcode_metadata.py
@@ -0,0 +1,371 @@
+# This file is generated by Tools/cases_generator/py_metadata_generator.py
+# from:
+# Python/bytecodes.c
+# Do not edit!
+_specializations = {
+ "RESUME": [
+ "RESUME_CHECK",
+ ],
+ "LOAD_CONST": [
+ "LOAD_CONST_MORTAL",
+ "LOAD_CONST_IMMORTAL",
+ ],
+ "TO_BOOL": [
+ "TO_BOOL_ALWAYS_TRUE",
+ "TO_BOOL_BOOL",
+ "TO_BOOL_INT",
+ "TO_BOOL_LIST",
+ "TO_BOOL_NONE",
+ "TO_BOOL_STR",
+ ],
+ "BINARY_OP": [
+ "BINARY_OP_MULTIPLY_INT",
+ "BINARY_OP_ADD_INT",
+ "BINARY_OP_SUBTRACT_INT",
+ "BINARY_OP_MULTIPLY_FLOAT",
+ "BINARY_OP_ADD_FLOAT",
+ "BINARY_OP_SUBTRACT_FLOAT",
+ "BINARY_OP_ADD_UNICODE",
+ "BINARY_OP_SUBSCR_LIST_INT",
+ "BINARY_OP_SUBSCR_LIST_SLICE",
+ "BINARY_OP_SUBSCR_TUPLE_INT",
+ "BINARY_OP_SUBSCR_STR_INT",
+ "BINARY_OP_SUBSCR_DICT",
+ "BINARY_OP_SUBSCR_GETITEM",
+ "BINARY_OP_EXTEND",
+ "BINARY_OP_INPLACE_ADD_UNICODE",
+ ],
+ "STORE_SUBSCR": [
+ "STORE_SUBSCR_DICT",
+ "STORE_SUBSCR_LIST_INT",
+ ],
+ "SEND": [
+ "SEND_GEN",
+ ],
+ "UNPACK_SEQUENCE": [
+ "UNPACK_SEQUENCE_TWO_TUPLE",
+ "UNPACK_SEQUENCE_TUPLE",
+ "UNPACK_SEQUENCE_LIST",
+ ],
+ "STORE_ATTR": [
+ "STORE_ATTR_INSTANCE_VALUE",
+ "STORE_ATTR_SLOT",
+ "STORE_ATTR_WITH_HINT",
+ ],
+ "LOAD_GLOBAL": [
+ "LOAD_GLOBAL_MODULE",
+ "LOAD_GLOBAL_BUILTIN",
+ ],
+ "LOAD_SUPER_ATTR": [
+ "LOAD_SUPER_ATTR_ATTR",
+ "LOAD_SUPER_ATTR_METHOD",
+ ],
+ "LOAD_ATTR": [
+ "LOAD_ATTR_INSTANCE_VALUE",
+ "LOAD_ATTR_MODULE",
+ "LOAD_ATTR_WITH_HINT",
+ "LOAD_ATTR_SLOT",
+ "LOAD_ATTR_CLASS",
+ "LOAD_ATTR_CLASS_WITH_METACLASS_CHECK",
+ "LOAD_ATTR_PROPERTY",
+ "LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN",
+ "LOAD_ATTR_METHOD_WITH_VALUES",
+ "LOAD_ATTR_METHOD_NO_DICT",
+ "LOAD_ATTR_METHOD_LAZY_DICT",
+ "LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES",
+ "LOAD_ATTR_NONDESCRIPTOR_NO_DICT",
+ ],
+ "COMPARE_OP": [
+ "COMPARE_OP_FLOAT",
+ "COMPARE_OP_INT",
+ "COMPARE_OP_STR",
+ ],
+ "CONTAINS_OP": [
+ "CONTAINS_OP_SET",
+ "CONTAINS_OP_DICT",
+ ],
+ "JUMP_BACKWARD": [
+ "JUMP_BACKWARD_NO_JIT",
+ "JUMP_BACKWARD_JIT",
+ ],
+ "FOR_ITER": [
+ "FOR_ITER_LIST",
+ "FOR_ITER_TUPLE",
+ "FOR_ITER_RANGE",
+ "FOR_ITER_GEN",
+ ],
+ "CALL": [
+ "CALL_BOUND_METHOD_EXACT_ARGS",
+ "CALL_PY_EXACT_ARGS",
+ "CALL_TYPE_1",
+ "CALL_STR_1",
+ "CALL_TUPLE_1",
+ "CALL_BUILTIN_CLASS",
+ "CALL_BUILTIN_O",
+ "CALL_BUILTIN_FAST",
+ "CALL_BUILTIN_FAST_WITH_KEYWORDS",
+ "CALL_LEN",
+ "CALL_ISINSTANCE",
+ "CALL_LIST_APPEND",
+ "CALL_METHOD_DESCRIPTOR_O",
+ "CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS",
+ "CALL_METHOD_DESCRIPTOR_NOARGS",
+ "CALL_METHOD_DESCRIPTOR_FAST",
+ "CALL_ALLOC_AND_ENTER_INIT",
+ "CALL_PY_GENERAL",
+ "CALL_BOUND_METHOD_GENERAL",
+ "CALL_NON_PY_GENERAL",
+ ],
+ "CALL_KW": [
+ "CALL_KW_BOUND_METHOD",
+ "CALL_KW_PY",
+ "CALL_KW_NON_PY",
+ ],
+}
+
+_specialized_opmap = {
+ 'BINARY_OP_ADD_FLOAT': 129,
+ 'BINARY_OP_ADD_INT': 130,
+ 'BINARY_OP_ADD_UNICODE': 131,
+ 'BINARY_OP_EXTEND': 132,
+ 'BINARY_OP_INPLACE_ADD_UNICODE': 3,
+ 'BINARY_OP_MULTIPLY_FLOAT': 133,
+ 'BINARY_OP_MULTIPLY_INT': 134,
+ 'BINARY_OP_SUBSCR_DICT': 135,
+ 'BINARY_OP_SUBSCR_GETITEM': 136,
+ 'BINARY_OP_SUBSCR_LIST_INT': 137,
+ 'BINARY_OP_SUBSCR_LIST_SLICE': 138,
+ 'BINARY_OP_SUBSCR_STR_INT': 139,
+ 'BINARY_OP_SUBSCR_TUPLE_INT': 140,
+ 'BINARY_OP_SUBTRACT_FLOAT': 141,
+ 'BINARY_OP_SUBTRACT_INT': 142,
+ 'CALL_ALLOC_AND_ENTER_INIT': 143,
+ 'CALL_BOUND_METHOD_EXACT_ARGS': 144,
+ 'CALL_BOUND_METHOD_GENERAL': 145,
+ 'CALL_BUILTIN_CLASS': 146,
+ 'CALL_BUILTIN_FAST': 147,
+ 'CALL_BUILTIN_FAST_WITH_KEYWORDS': 148,
+ 'CALL_BUILTIN_O': 149,
+ 'CALL_ISINSTANCE': 150,
+ 'CALL_KW_BOUND_METHOD': 151,
+ 'CALL_KW_NON_PY': 152,
+ 'CALL_KW_PY': 153,
+ 'CALL_LEN': 154,
+ 'CALL_LIST_APPEND': 155,
+ 'CALL_METHOD_DESCRIPTOR_FAST': 156,
+ 'CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS': 157,
+ 'CALL_METHOD_DESCRIPTOR_NOARGS': 158,
+ 'CALL_METHOD_DESCRIPTOR_O': 159,
+ 'CALL_NON_PY_GENERAL': 160,
+ 'CALL_PY_EXACT_ARGS': 161,
+ 'CALL_PY_GENERAL': 162,
+ 'CALL_STR_1': 163,
+ 'CALL_TUPLE_1': 164,
+ 'CALL_TYPE_1': 165,
+ 'COMPARE_OP_FLOAT': 166,
+ 'COMPARE_OP_INT': 167,
+ 'COMPARE_OP_STR': 168,
+ 'CONTAINS_OP_DICT': 169,
+ 'CONTAINS_OP_SET': 170,
+ 'FOR_ITER_GEN': 171,
+ 'FOR_ITER_LIST': 172,
+ 'FOR_ITER_RANGE': 173,
+ 'FOR_ITER_TUPLE': 174,
+ 'JUMP_BACKWARD_JIT': 175,
+ 'JUMP_BACKWARD_NO_JIT': 176,
+ 'LOAD_ATTR_CLASS': 177,
+ 'LOAD_ATTR_CLASS_WITH_METACLASS_CHECK': 178,
+ 'LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN': 179,
+ 'LOAD_ATTR_INSTANCE_VALUE': 180,
+ 'LOAD_ATTR_METHOD_LAZY_DICT': 181,
+ 'LOAD_ATTR_METHOD_NO_DICT': 182,
+ 'LOAD_ATTR_METHOD_WITH_VALUES': 183,
+ 'LOAD_ATTR_MODULE': 184,
+ 'LOAD_ATTR_NONDESCRIPTOR_NO_DICT': 185,
+ 'LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES': 186,
+ 'LOAD_ATTR_PROPERTY': 187,
+ 'LOAD_ATTR_SLOT': 188,
+ 'LOAD_ATTR_WITH_HINT': 189,
+ 'LOAD_CONST_IMMORTAL': 190,
+ 'LOAD_CONST_MORTAL': 191,
+ 'LOAD_GLOBAL_BUILTIN': 192,
+ 'LOAD_GLOBAL_MODULE': 193,
+ 'LOAD_SUPER_ATTR_ATTR': 194,
+ 'LOAD_SUPER_ATTR_METHOD': 195,
+ 'RESUME_CHECK': 196,
+ 'SEND_GEN': 197,
+ 'STORE_ATTR_INSTANCE_VALUE': 198,
+ 'STORE_ATTR_SLOT': 199,
+ 'STORE_ATTR_WITH_HINT': 200,
+ 'STORE_SUBSCR_DICT': 201,
+ 'STORE_SUBSCR_LIST_INT': 202,
+ 'TO_BOOL_ALWAYS_TRUE': 203,
+ 'TO_BOOL_BOOL': 204,
+ 'TO_BOOL_INT': 205,
+ 'TO_BOOL_LIST': 206,
+ 'TO_BOOL_NONE': 207,
+ 'TO_BOOL_STR': 208,
+ 'UNPACK_SEQUENCE_LIST': 209,
+ 'UNPACK_SEQUENCE_TUPLE': 210,
+ 'UNPACK_SEQUENCE_TWO_TUPLE': 211,
+}
+
+opmap = {
+ 'CACHE': 0,
+ 'RESERVED': 17,
+ 'RESUME': 128,
+ 'INSTRUMENTED_LINE': 254,
+ 'ENTER_EXECUTOR': 255,
+ 'BINARY_SLICE': 1,
+ 'BUILD_TEMPLATE': 2,
+ 'CALL_FUNCTION_EX': 4,
+ 'CHECK_EG_MATCH': 5,
+ 'CHECK_EXC_MATCH': 6,
+ 'CLEANUP_THROW': 7,
+ 'DELETE_SUBSCR': 8,
+ 'END_FOR': 9,
+ 'END_SEND': 10,
+ 'EXIT_INIT_CHECK': 11,
+ 'FORMAT_SIMPLE': 12,
+ 'FORMAT_WITH_SPEC': 13,
+ 'GET_AITER': 14,
+ 'GET_ANEXT': 15,
+ 'GET_ITER': 16,
+ 'GET_LEN': 18,
+ 'GET_YIELD_FROM_ITER': 19,
+ 'INTERPRETER_EXIT': 20,
+ 'LOAD_BUILD_CLASS': 21,
+ 'LOAD_LOCALS': 22,
+ 'MAKE_FUNCTION': 23,
+ 'MATCH_KEYS': 24,
+ 'MATCH_MAPPING': 25,
+ 'MATCH_SEQUENCE': 26,
+ 'NOP': 27,
+ 'NOT_TAKEN': 28,
+ 'POP_EXCEPT': 29,
+ 'POP_ITER': 30,
+ 'POP_TOP': 31,
+ 'PUSH_EXC_INFO': 32,
+ 'PUSH_NULL': 33,
+ 'RETURN_GENERATOR': 34,
+ 'RETURN_VALUE': 35,
+ 'SETUP_ANNOTATIONS': 36,
+ 'STORE_SLICE': 37,
+ 'STORE_SUBSCR': 38,
+ 'TO_BOOL': 39,
+ 'UNARY_INVERT': 40,
+ 'UNARY_NEGATIVE': 41,
+ 'UNARY_NOT': 42,
+ 'WITH_EXCEPT_START': 43,
+ 'BINARY_OP': 44,
+ 'BUILD_INTERPOLATION': 45,
+ 'BUILD_LIST': 46,
+ 'BUILD_MAP': 47,
+ 'BUILD_SET': 48,
+ 'BUILD_SLICE': 49,
+ 'BUILD_STRING': 50,
+ 'BUILD_TUPLE': 51,
+ 'CALL': 52,
+ 'CALL_INTRINSIC_1': 53,
+ 'CALL_INTRINSIC_2': 54,
+ 'CALL_KW': 55,
+ 'COMPARE_OP': 56,
+ 'CONTAINS_OP': 57,
+ 'CONVERT_VALUE': 58,
+ 'COPY': 59,
+ 'COPY_FREE_VARS': 60,
+ 'DELETE_ATTR': 61,
+ 'DELETE_DEREF': 62,
+ 'DELETE_FAST': 63,
+ 'DELETE_GLOBAL': 64,
+ 'DELETE_NAME': 65,
+ 'DICT_MERGE': 66,
+ 'DICT_UPDATE': 67,
+ 'END_ASYNC_FOR': 68,
+ 'EXTENDED_ARG': 69,
+ 'FOR_ITER': 70,
+ 'GET_AWAITABLE': 71,
+ 'IMPORT_FROM': 72,
+ 'IMPORT_NAME': 73,
+ 'IS_OP': 74,
+ 'JUMP_BACKWARD': 75,
+ 'JUMP_BACKWARD_NO_INTERRUPT': 76,
+ 'JUMP_FORWARD': 77,
+ 'LIST_APPEND': 78,
+ 'LIST_EXTEND': 79,
+ 'LOAD_ATTR': 80,
+ 'LOAD_COMMON_CONSTANT': 81,
+ 'LOAD_CONST': 82,
+ 'LOAD_DEREF': 83,
+ 'LOAD_FAST': 84,
+ 'LOAD_FAST_AND_CLEAR': 85,
+ 'LOAD_FAST_BORROW': 86,
+ 'LOAD_FAST_BORROW_LOAD_FAST_BORROW': 87,
+ 'LOAD_FAST_CHECK': 88,
+ 'LOAD_FAST_LOAD_FAST': 89,
+ 'LOAD_FROM_DICT_OR_DEREF': 90,
+ 'LOAD_FROM_DICT_OR_GLOBALS': 91,
+ 'LOAD_GLOBAL': 92,
+ 'LOAD_NAME': 93,
+ 'LOAD_SMALL_INT': 94,
+ 'LOAD_SPECIAL': 95,
+ 'LOAD_SUPER_ATTR': 96,
+ 'MAKE_CELL': 97,
+ 'MAP_ADD': 98,
+ 'MATCH_CLASS': 99,
+ 'POP_JUMP_IF_FALSE': 100,
+ 'POP_JUMP_IF_NONE': 101,
+ 'POP_JUMP_IF_NOT_NONE': 102,
+ 'POP_JUMP_IF_TRUE': 103,
+ 'RAISE_VARARGS': 104,
+ 'RERAISE': 105,
+ 'SEND': 106,
+ 'SET_ADD': 107,
+ 'SET_FUNCTION_ATTRIBUTE': 108,
+ 'SET_UPDATE': 109,
+ 'STORE_ATTR': 110,
+ 'STORE_DEREF': 111,
+ 'STORE_FAST': 112,
+ 'STORE_FAST_LOAD_FAST': 113,
+ 'STORE_FAST_STORE_FAST': 114,
+ 'STORE_GLOBAL': 115,
+ 'STORE_NAME': 116,
+ 'SWAP': 117,
+ 'UNPACK_EX': 118,
+ 'UNPACK_SEQUENCE': 119,
+ 'YIELD_VALUE': 120,
+ 'INSTRUMENTED_END_FOR': 234,
+ 'INSTRUMENTED_POP_ITER': 235,
+ 'INSTRUMENTED_END_SEND': 236,
+ 'INSTRUMENTED_FOR_ITER': 237,
+ 'INSTRUMENTED_INSTRUCTION': 238,
+ 'INSTRUMENTED_JUMP_FORWARD': 239,
+ 'INSTRUMENTED_NOT_TAKEN': 240,
+ 'INSTRUMENTED_POP_JUMP_IF_TRUE': 241,
+ 'INSTRUMENTED_POP_JUMP_IF_FALSE': 242,
+ 'INSTRUMENTED_POP_JUMP_IF_NONE': 243,
+ 'INSTRUMENTED_POP_JUMP_IF_NOT_NONE': 244,
+ 'INSTRUMENTED_RESUME': 245,
+ 'INSTRUMENTED_RETURN_VALUE': 246,
+ 'INSTRUMENTED_YIELD_VALUE': 247,
+ 'INSTRUMENTED_END_ASYNC_FOR': 248,
+ 'INSTRUMENTED_LOAD_SUPER_ATTR': 249,
+ 'INSTRUMENTED_CALL': 250,
+ 'INSTRUMENTED_CALL_KW': 251,
+ 'INSTRUMENTED_CALL_FUNCTION_EX': 252,
+ 'INSTRUMENTED_JUMP_BACKWARD': 253,
+ 'ANNOTATIONS_PLACEHOLDER': 256,
+ 'JUMP': 257,
+ 'JUMP_IF_FALSE': 258,
+ 'JUMP_IF_TRUE': 259,
+ 'JUMP_NO_INTERRUPT': 260,
+ 'LOAD_CLOSURE': 261,
+ 'POP_BLOCK': 262,
+ 'SETUP_CLEANUP': 263,
+ 'SETUP_FINALLY': 264,
+ 'SETUP_WITH': 265,
+ 'STORE_FAST_MAYBE_NULL': 266,
+}
+
+HAVE_ARGUMENT = 43
+MIN_INSTRUMENTED_OPCODE = 234
diff --git a/python/python3_13/examples/_osx_support.py b/python/python3_14/examples/_osx_support.py
similarity index 100%
rename from python/python3_13/examples/_osx_support.py
rename to python/python3_14/examples/_osx_support.py
diff --git a/python/python3_13/examples/_py_abc.py b/python/python3_14/examples/_py_abc.py
similarity index 100%
rename from python/python3_13/examples/_py_abc.py
rename to python/python3_14/examples/_py_abc.py
diff --git a/python/python3_14/examples/_py_warnings.py b/python/python3_14/examples/_py_warnings.py
new file mode 100644
index 0000000000..55f8c06959
--- /dev/null
+++ b/python/python3_14/examples/_py_warnings.py
@@ -0,0 +1,869 @@
+"""Python part of the warnings subsystem."""
+
+import sys
+import _contextvars
+import _thread
+
+
+__all__ = ["warn", "warn_explicit", "showwarning",
+ "formatwarning", "filterwarnings", "simplefilter",
+ "resetwarnings", "catch_warnings", "deprecated"]
+
+
+# Normally '_wm' is sys.modules['warnings'] but for unit tests it can be
+# a different module. User code is allowed to reassign global attributes
+# of the 'warnings' module, commonly 'filters' or 'showwarning'. So we
+# need to lookup these global attributes dynamically on the '_wm' object,
+# rather than binding them earlier. The code in this module consistently uses
+# '_wm.' rather than using the globals of this module. If the
+# '_warnings' C extension is in use, some globals are replaced by functions
+# and variables defined in that extension.
+_wm = None
+
+
+def _set_module(module):
+ global _wm
+ _wm = module
+
+
+# filters contains a sequence of filter 5-tuples
+# The components of the 5-tuple are:
+# - an action: error, ignore, always, all, default, module, or once
+# - a compiled regex that must match the warning message
+# - a class representing the warning category
+# - a compiled regex that must match the module that is being warned
+# - a line number for the line being warned, or 0 to mean any line
+# If either of the compiled regexes is None, it matches anything.
+filters = []
+
+
+defaultaction = "default"
+onceregistry = {}
+_lock = _thread.RLock()
+_filters_version = 1
+
+
+# If true, catch_warnings() will use a context var to hold the modified
+# filters list. Otherwise, catch_warnings() will operate on the 'filters'
+# global of the warnings module.
+_use_context = sys.flags.context_aware_warnings
+
+
+class _Context:
+ def __init__(self, filters):
+ self._filters = filters
+ self.log = None # if set to a list, logging is enabled
+
+ def copy(self):
+ context = _Context(self._filters[:])
+ if self.log is not None:
+ context.log = self.log
+ return context
+
+ def _record_warning(self, msg):
+ self.log.append(msg)
+
+
+class _GlobalContext(_Context):
+ def __init__(self):
+ self.log = None
+
+ @property
+ def _filters(self):
+ # Since there is quite a lot of code that assigns to
+ # warnings.filters, this needs to return the current value of
+ # the module global.
+ try:
+ return _wm.filters
+ except AttributeError:
+ # 'filters' global was deleted. Do we need to actually handle this case?
+ return []
+
+
+_global_context = _GlobalContext()
+
+
+_warnings_context = _contextvars.ContextVar('warnings_context')
+
+
+def _get_context():
+ if not _use_context:
+ return _global_context
+ try:
+ return _wm._warnings_context.get()
+ except LookupError:
+ return _global_context
+
+
+def _set_context(context):
+ assert _use_context
+ _wm._warnings_context.set(context)
+
+
+def _new_context():
+ assert _use_context
+ old_context = _wm._get_context()
+ new_context = old_context.copy()
+ _wm._set_context(new_context)
+ return old_context, new_context
+
+
+def _get_filters():
+ """Return the current list of filters. This is a non-public API used by
+ module functions and by the unit tests."""
+ return _wm._get_context()._filters
+
+
+def _filters_mutated_lock_held():
+ _wm._filters_version += 1
+
+
+def showwarning(message, category, filename, lineno, file=None, line=None):
+ """Hook to write a warning to a file; replace if you like."""
+ msg = _wm.WarningMessage(message, category, filename, lineno, file, line)
+ _wm._showwarnmsg_impl(msg)
+
+
+def formatwarning(message, category, filename, lineno, line=None):
+ """Function to format a warning the standard way."""
+ msg = _wm.WarningMessage(message, category, filename, lineno, None, line)
+ return _wm._formatwarnmsg_impl(msg)
+
+
+def _showwarnmsg_impl(msg):
+ context = _wm._get_context()
+ if context.log is not None:
+ context._record_warning(msg)
+ return
+ file = msg.file
+ if file is None:
+ file = sys.stderr
+ if file is None:
+ # sys.stderr is None when run with pythonw.exe:
+ # warnings get lost
+ return
+ text = _wm._formatwarnmsg(msg)
+ try:
+ file.write(text)
+ except OSError:
+ # the file (probably stderr) is invalid - this warning gets lost.
+ pass
+
+
+def _formatwarnmsg_impl(msg):
+ category = msg.category.__name__
+ s = f"{msg.filename}:{msg.lineno}: {category}: {msg.message}\n"
+
+ if msg.line is None:
+ try:
+ import linecache
+ line = linecache.getline(msg.filename, msg.lineno)
+ except Exception:
+ # When a warning is logged during Python shutdown, linecache
+ # and the import machinery don't work anymore
+ line = None
+ linecache = None
+ else:
+ line = msg.line
+ if line:
+ line = line.strip()
+ s += " %s\n" % line
+
+ if msg.source is not None:
+ try:
+ import tracemalloc
+ # Logging a warning should not raise a new exception:
+ # catch Exception, not only ImportError and RecursionError.
+ except Exception:
+ # don't suggest to enable tracemalloc if it's not available
+ suggest_tracemalloc = False
+ tb = None
+ else:
+ try:
+ suggest_tracemalloc = not tracemalloc.is_tracing()
+ tb = tracemalloc.get_object_traceback(msg.source)
+ except Exception:
+ # When a warning is logged during Python shutdown, tracemalloc
+ # and the import machinery don't work anymore
+ suggest_tracemalloc = False
+ tb = None
+
+ if tb is not None:
+ s += 'Object allocated at (most recent call last):\n'
+ for frame in tb:
+ s += (' File "%s", lineno %s\n'
+ % (frame.filename, frame.lineno))
+
+ try:
+ if linecache is not None:
+ line = linecache.getline(frame.filename, frame.lineno)
+ else:
+ line = None
+ except Exception:
+ line = None
+ if line:
+ line = line.strip()
+ s += ' %s\n' % line
+ elif suggest_tracemalloc:
+ s += (f'{category}: Enable tracemalloc to get the object '
+ f'allocation traceback\n')
+ return s
+
+
+# Keep a reference to check if the function was replaced
+_showwarning_orig = showwarning
+
+
+def _showwarnmsg(msg):
+ """Hook to write a warning to a file; replace if you like."""
+ try:
+ sw = _wm.showwarning
+ except AttributeError:
+ pass
+ else:
+ if sw is not _showwarning_orig:
+ # warnings.showwarning() was replaced
+ if not callable(sw):
+ raise TypeError("warnings.showwarning() must be set to a "
+ "function or method")
+
+ sw(msg.message, msg.category, msg.filename, msg.lineno,
+ msg.file, msg.line)
+ return
+ _wm._showwarnmsg_impl(msg)
+
+
+# Keep a reference to check if the function was replaced
+_formatwarning_orig = formatwarning
+
+
+def _formatwarnmsg(msg):
+ """Function to format a warning the standard way."""
+ try:
+ fw = _wm.formatwarning
+ except AttributeError:
+ pass
+ else:
+ if fw is not _formatwarning_orig:
+ # warnings.formatwarning() was replaced
+ return fw(msg.message, msg.category,
+ msg.filename, msg.lineno, msg.line)
+ return _wm._formatwarnmsg_impl(msg)
+
+
+def filterwarnings(action, message="", category=Warning, module="", lineno=0,
+ append=False):
+ """Insert an entry into the list of warnings filters (at the front).
+
+ 'action' -- one of "error", "ignore", "always", "all", "default", "module",
+ or "once"
+ 'message' -- a regex that the warning message must match
+ 'category' -- a class that the warning must be a subclass of
+ 'module' -- a regex that the module name must match
+ 'lineno' -- an integer line number, 0 matches all warnings
+ 'append' -- if true, append to the list of filters
+ """
+ if action not in {"error", "ignore", "always", "all", "default", "module", "once"}:
+ raise ValueError(f"invalid action: {action!r}")
+ if not isinstance(message, str):
+ raise TypeError("message must be a string")
+ if not isinstance(category, type) or not issubclass(category, Warning):
+ raise TypeError("category must be a Warning subclass")
+ if not isinstance(module, str):
+ raise TypeError("module must be a string")
+ if not isinstance(lineno, int):
+ raise TypeError("lineno must be an int")
+ if lineno < 0:
+ raise ValueError("lineno must be an int >= 0")
+
+ if message or module:
+ import re
+
+ if message:
+ message = re.compile(message, re.I)
+ else:
+ message = None
+ if module:
+ module = re.compile(module)
+ else:
+ module = None
+
+ _wm._add_filter(action, message, category, module, lineno, append=append)
+
+
+def simplefilter(action, category=Warning, lineno=0, append=False):
+ """Insert a simple entry into the list of warnings filters (at the front).
+
+ A simple filter matches all modules and messages.
+ 'action' -- one of "error", "ignore", "always", "all", "default", "module",
+ or "once"
+ 'category' -- a class that the warning must be a subclass of
+ 'lineno' -- an integer line number, 0 matches all warnings
+ 'append' -- if true, append to the list of filters
+ """
+ if action not in {"error", "ignore", "always", "all", "default", "module", "once"}:
+ raise ValueError(f"invalid action: {action!r}")
+ if not isinstance(lineno, int):
+ raise TypeError("lineno must be an int")
+ if lineno < 0:
+ raise ValueError("lineno must be an int >= 0")
+ _wm._add_filter(action, None, category, None, lineno, append=append)
+
+
+def _filters_mutated():
+ # Even though this function is not part of the public API, it's used by
+ # a fair amount of user code.
+ with _wm._lock:
+ _wm._filters_mutated_lock_held()
+
+
+def _add_filter(*item, append):
+ with _wm._lock:
+ filters = _wm._get_filters()
+ if not append:
+ # Remove possible duplicate filters, so new one will be placed
+ # in correct place. If append=True and duplicate exists, do nothing.
+ try:
+ filters.remove(item)
+ except ValueError:
+ pass
+ filters.insert(0, item)
+ else:
+ if item not in filters:
+ filters.append(item)
+ _wm._filters_mutated_lock_held()
+
+
+def resetwarnings():
+ """Clear the list of warning filters, so that no filters are active."""
+ with _wm._lock:
+ del _wm._get_filters()[:]
+ _wm._filters_mutated_lock_held()
+
+
+class _OptionError(Exception):
+ """Exception used by option processing helpers."""
+ pass
+
+
+# Helper to process -W options passed via sys.warnoptions
+def _processoptions(args):
+ for arg in args:
+ try:
+ _wm._setoption(arg)
+ except _wm._OptionError as msg:
+ print("Invalid -W option ignored:", msg, file=sys.stderr)
+
+
+# Helper for _processoptions()
+def _setoption(arg):
+ parts = arg.split(':')
+ if len(parts) > 5:
+ raise _wm._OptionError("too many fields (max 5): %r" % (arg,))
+ while len(parts) < 5:
+ parts.append('')
+ action, message, category, module, lineno = [s.strip()
+ for s in parts]
+ action = _wm._getaction(action)
+ category = _wm._getcategory(category)
+ if message or module:
+ import re
+ if message:
+ message = re.escape(message)
+ if module:
+ module = re.escape(module) + r'\z'
+ if lineno:
+ try:
+ lineno = int(lineno)
+ if lineno < 0:
+ raise ValueError
+ except (ValueError, OverflowError):
+ raise _wm._OptionError("invalid lineno %r" % (lineno,)) from None
+ else:
+ lineno = 0
+ _wm.filterwarnings(action, message, category, module, lineno)
+
+
+# Helper for _setoption()
+def _getaction(action):
+ if not action:
+ return "default"
+ for a in ('default', 'always', 'all', 'ignore', 'module', 'once', 'error'):
+ if a.startswith(action):
+ return a
+ raise _wm._OptionError("invalid action: %r" % (action,))
+
+
+# Helper for _setoption()
+def _getcategory(category):
+ if not category:
+ return Warning
+ if '.' not in category:
+ import builtins as m
+ klass = category
+ else:
+ module, _, klass = category.rpartition('.')
+ try:
+ m = __import__(module, None, None, [klass])
+ except ImportError:
+ raise _wm._OptionError("invalid module name: %r" % (module,)) from None
+ try:
+ cat = getattr(m, klass)
+ except AttributeError:
+ raise _wm._OptionError("unknown warning category: %r" % (category,)) from None
+ if not issubclass(cat, Warning):
+ raise _wm._OptionError("invalid warning category: %r" % (category,))
+ return cat
+
+
+def _is_internal_filename(filename):
+ return 'importlib' in filename and '_bootstrap' in filename
+
+
+def _is_filename_to_skip(filename, skip_file_prefixes):
+ return any(filename.startswith(prefix) for prefix in skip_file_prefixes)
+
+
+def _is_internal_frame(frame):
+ """Signal whether the frame is an internal CPython implementation detail."""
+ return _is_internal_filename(frame.f_code.co_filename)
+
+
+def _next_external_frame(frame, skip_file_prefixes):
+ """Find the next frame that doesn't involve Python or user internals."""
+ frame = frame.f_back
+ while frame is not None and (
+ _is_internal_filename(filename := frame.f_code.co_filename) or
+ _is_filename_to_skip(filename, skip_file_prefixes)):
+ frame = frame.f_back
+ return frame
+
+
+# Code typically replaced by _warnings
+def warn(message, category=None, stacklevel=1, source=None,
+ *, skip_file_prefixes=()):
+ """Issue a warning, or maybe ignore it or raise an exception."""
+ # Check if message is already a Warning object
+ if isinstance(message, Warning):
+ category = message.__class__
+ # Check category argument
+ if category is None:
+ category = UserWarning
+ if not (isinstance(category, type) and issubclass(category, Warning)):
+ raise TypeError("category must be a Warning subclass, "
+ "not '{:s}'".format(type(category).__name__))
+ if not isinstance(skip_file_prefixes, tuple):
+ # The C version demands a tuple for implementation performance.
+ raise TypeError('skip_file_prefixes must be a tuple of strs.')
+ if skip_file_prefixes:
+ stacklevel = max(2, stacklevel)
+ # Get context information
+ try:
+ if stacklevel <= 1 or _is_internal_frame(sys._getframe(1)):
+ # If frame is too small to care or if the warning originated in
+ # internal code, then do not try to hide any frames.
+ frame = sys._getframe(stacklevel)
+ else:
+ frame = sys._getframe(1)
+ # Look for one frame less since the above line starts us off.
+ for x in range(stacklevel-1):
+ frame = _next_external_frame(frame, skip_file_prefixes)
+ if frame is None:
+ raise ValueError
+ except ValueError:
+ globals = sys.__dict__
+ filename = ""
+ lineno = 0
+ else:
+ globals = frame.f_globals
+ filename = frame.f_code.co_filename
+ lineno = frame.f_lineno
+ if '__name__' in globals:
+ module = globals['__name__']
+ else:
+ module = ""
+ registry = globals.setdefault("__warningregistry__", {})
+ _wm.warn_explicit(
+ message,
+ category,
+ filename,
+ lineno,
+ module,
+ registry,
+ globals,
+ source=source,
+ )
+
+
+def warn_explicit(message, category, filename, lineno,
+ module=None, registry=None, module_globals=None,
+ source=None):
+ lineno = int(lineno)
+ if module is None:
+ module = filename or ""
+ if module[-3:].lower() == ".py":
+ module = module[:-3] # XXX What about leading pathname?
+ if isinstance(message, Warning):
+ text = str(message)
+ category = message.__class__
+ else:
+ text = message
+ message = category(message)
+ key = (text, category, lineno)
+ with _wm._lock:
+ if registry is None:
+ registry = {}
+ if registry.get('version', 0) != _wm._filters_version:
+ registry.clear()
+ registry['version'] = _wm._filters_version
+ # Quick test for common case
+ if registry.get(key):
+ return
+ # Search the filters
+ for item in _wm._get_filters():
+ action, msg, cat, mod, ln = item
+ if ((msg is None or msg.match(text)) and
+ issubclass(category, cat) and
+ (mod is None or mod.match(module)) and
+ (ln == 0 or lineno == ln)):
+ break
+ else:
+ action = _wm.defaultaction
+ # Early exit actions
+ if action == "ignore":
+ return
+
+ if action == "error":
+ raise message
+ # Other actions
+ if action == "once":
+ registry[key] = 1
+ oncekey = (text, category)
+ if _wm.onceregistry.get(oncekey):
+ return
+ _wm.onceregistry[oncekey] = 1
+ elif action in {"always", "all"}:
+ pass
+ elif action == "module":
+ registry[key] = 1
+ altkey = (text, category, 0)
+ if registry.get(altkey):
+ return
+ registry[altkey] = 1
+ elif action == "default":
+ registry[key] = 1
+ else:
+ # Unrecognized actions are errors
+ raise RuntimeError(
+ "Unrecognized action (%r) in warnings.filters:\n %s" %
+ (action, item))
+
+ # Prime the linecache for formatting, in case the
+ # "file" is actually in a zipfile or something.
+ import linecache
+ linecache.getlines(filename, module_globals)
+
+ # Print message and context
+ msg = _wm.WarningMessage(message, category, filename, lineno, source=source)
+ _wm._showwarnmsg(msg)
+
+
+class WarningMessage(object):
+
+ _WARNING_DETAILS = ("message", "category", "filename", "lineno", "file",
+ "line", "source")
+
+ def __init__(self, message, category, filename, lineno, file=None,
+ line=None, source=None):
+ self.message = message
+ self.category = category
+ self.filename = filename
+ self.lineno = lineno
+ self.file = file
+ self.line = line
+ self.source = source
+ self._category_name = category.__name__ if category else None
+
+ def __str__(self):
+ return ("{message : %r, category : %r, filename : %r, lineno : %s, "
+ "line : %r}" % (self.message, self._category_name,
+ self.filename, self.lineno, self.line))
+
+ def __repr__(self):
+ return f'<{type(self).__qualname__} {self}>'
+
+
+class catch_warnings(object):
+
+ """A context manager that copies and restores the warnings filter upon
+ exiting the context.
+
+ The 'record' argument specifies whether warnings should be captured by a
+ custom implementation of warnings.showwarning() and be appended to a list
+ returned by the context manager. Otherwise None is returned by the context
+ manager. The objects appended to the list are arguments whose attributes
+ mirror the arguments to showwarning().
+
+ The 'module' argument is to specify an alternative module to the module
+ named 'warnings' and imported under that name. This argument is only useful
+ when testing the warnings module itself.
+
+ If the 'action' argument is not None, the remaining arguments are passed
+ to warnings.simplefilter() as if it were called immediately on entering the
+ context.
+ """
+
+ def __init__(self, *, record=False, module=None,
+ action=None, category=Warning, lineno=0, append=False):
+ """Specify whether to record warnings and if an alternative module
+ should be used other than sys.modules['warnings'].
+
+ """
+ self._record = record
+ self._module = sys.modules['warnings'] if module is None else module
+ self._entered = False
+ if action is None:
+ self._filter = None
+ else:
+ self._filter = (action, category, lineno, append)
+
+ def __repr__(self):
+ args = []
+ if self._record:
+ args.append("record=True")
+ if self._module is not sys.modules['warnings']:
+ args.append("module=%r" % self._module)
+ name = type(self).__name__
+ return "%s(%s)" % (name, ", ".join(args))
+
+ def __enter__(self):
+ if self._entered:
+ raise RuntimeError("Cannot enter %r twice" % self)
+ self._entered = True
+ with _wm._lock:
+ if _use_context:
+ self._saved_context, context = self._module._new_context()
+ else:
+ context = None
+ self._filters = self._module.filters
+ self._module.filters = self._filters[:]
+ self._showwarning = self._module.showwarning
+ self._showwarnmsg_impl = self._module._showwarnmsg_impl
+ self._module._filters_mutated_lock_held()
+ if self._record:
+ if _use_context:
+ context.log = log = []
+ else:
+ log = []
+ self._module._showwarnmsg_impl = log.append
+ # Reset showwarning() to the default implementation to make sure
+ # that _showwarnmsg() calls _showwarnmsg_impl()
+ self._module.showwarning = self._module._showwarning_orig
+ else:
+ log = None
+ if self._filter is not None:
+ self._module.simplefilter(*self._filter)
+ return log
+
+ def __exit__(self, *exc_info):
+ if not self._entered:
+ raise RuntimeError("Cannot exit %r without entering first" % self)
+ with _wm._lock:
+ if _use_context:
+ self._module._warnings_context.set(self._saved_context)
+ else:
+ self._module.filters = self._filters
+ self._module.showwarning = self._showwarning
+ self._module._showwarnmsg_impl = self._showwarnmsg_impl
+ self._module._filters_mutated_lock_held()
+
+
+class deprecated:
+ """Indicate that a class, function or overload is deprecated.
+
+ When this decorator is applied to an object, the type checker
+ will generate a diagnostic on usage of the deprecated object.
+
+ Usage:
+
+ @deprecated("Use B instead")
+ class A:
+ pass
+
+ @deprecated("Use g instead")
+ def f():
+ pass
+
+ @overload
+ @deprecated("int support is deprecated")
+ def g(x: int) -> int: ...
+ @overload
+ def g(x: str) -> int: ...
+
+ The warning specified by *category* will be emitted at runtime
+ on use of deprecated objects. For functions, that happens on calls;
+ for classes, on instantiation and on creation of subclasses.
+ If the *category* is ``None``, no warning is emitted at runtime.
+ The *stacklevel* determines where the
+ warning is emitted. If it is ``1`` (the default), the warning
+ is emitted at the direct caller of the deprecated object; if it
+ is higher, it is emitted further up the stack.
+ Static type checker behavior is not affected by the *category*
+ and *stacklevel* arguments.
+
+ The deprecation message passed to the decorator is saved in the
+ ``__deprecated__`` attribute on the decorated object.
+ If applied to an overload, the decorator
+ must be after the ``@overload`` decorator for the attribute to
+ exist on the overload as returned by ``get_overloads()``.
+
+ See PEP 702 for details.
+
+ """
+ def __init__(
+ self,
+ message: str,
+ /,
+ *,
+ category: type[Warning] | None = DeprecationWarning,
+ stacklevel: int = 1,
+ ) -> None:
+ if not isinstance(message, str):
+ raise TypeError(
+ f"Expected an object of type str for 'message', not {type(message).__name__!r}"
+ )
+ self.message = message
+ self.category = category
+ self.stacklevel = stacklevel
+
+ def __call__(self, arg, /):
+ # Make sure the inner functions created below don't
+ # retain a reference to self.
+ msg = self.message
+ category = self.category
+ stacklevel = self.stacklevel
+ if category is None:
+ arg.__deprecated__ = msg
+ return arg
+ elif isinstance(arg, type):
+ import functools
+ from types import MethodType
+
+ original_new = arg.__new__
+
+ @functools.wraps(original_new)
+ def __new__(cls, /, *args, **kwargs):
+ if cls is arg:
+ _wm.warn(msg, category=category, stacklevel=stacklevel + 1)
+ if original_new is not object.__new__:
+ return original_new(cls, *args, **kwargs)
+ # Mirrors a similar check in object.__new__.
+ elif cls.__init__ is object.__init__ and (args or kwargs):
+ raise TypeError(f"{cls.__name__}() takes no arguments")
+ else:
+ return original_new(cls)
+
+ arg.__new__ = staticmethod(__new__)
+
+ if "__init_subclass__" in arg.__dict__:
+ # __init_subclass__ is directly present on the decorated class.
+ # Synthesize a wrapper that calls this method directly.
+ original_init_subclass = arg.__init_subclass__
+ # We need slightly different behavior if __init_subclass__
+ # is a bound method (likely if it was implemented in Python).
+ # Otherwise, it likely means it's a builtin such as
+ # object's implementation of __init_subclass__.
+ if isinstance(original_init_subclass, MethodType):
+ original_init_subclass = original_init_subclass.__func__
+
+ @functools.wraps(original_init_subclass)
+ def __init_subclass__(*args, **kwargs):
+ _wm.warn(msg, category=category, stacklevel=stacklevel + 1)
+ return original_init_subclass(*args, **kwargs)
+ else:
+ def __init_subclass__(cls, *args, **kwargs):
+ _wm.warn(msg, category=category, stacklevel=stacklevel + 1)
+ return super(arg, cls).__init_subclass__(*args, **kwargs)
+
+ arg.__init_subclass__ = classmethod(__init_subclass__)
+
+ arg.__deprecated__ = __new__.__deprecated__ = msg
+ __init_subclass__.__deprecated__ = msg
+ return arg
+ elif callable(arg):
+ import functools
+ import inspect
+
+ @functools.wraps(arg)
+ def wrapper(*args, **kwargs):
+ _wm.warn(msg, category=category, stacklevel=stacklevel + 1)
+ return arg(*args, **kwargs)
+
+ if inspect.iscoroutinefunction(arg):
+ wrapper = inspect.markcoroutinefunction(wrapper)
+
+ arg.__deprecated__ = wrapper.__deprecated__ = msg
+ return wrapper
+ else:
+ raise TypeError(
+ "@deprecated decorator with non-None category must be applied to "
+ f"a class or callable, not {arg!r}"
+ )
+
+
+_DEPRECATED_MSG = "{name!r} is deprecated and slated for removal in Python {remove}"
+
+
+def _deprecated(name, message=_DEPRECATED_MSG, *, remove, _version=sys.version_info):
+ """Warn that *name* is deprecated or should be removed.
+
+ RuntimeError is raised if *remove* specifies a major/minor tuple older than
+ the current Python version or the same version but past the alpha.
+
+ The *message* argument is formatted with *name* and *remove* as a Python
+ version tuple (e.g. (3, 11)).
+
+ """
+ remove_formatted = f"{remove[0]}.{remove[1]}"
+ if (_version[:2] > remove) or (_version[:2] == remove and _version[3] != "alpha"):
+ msg = f"{name!r} was slated for removal after Python {remove_formatted} alpha"
+ raise RuntimeError(msg)
+ else:
+ msg = message.format(name=name, remove=remove_formatted)
+ _wm.warn(msg, DeprecationWarning, stacklevel=3)
+
+
+# Private utility function called by _PyErr_WarnUnawaitedCoroutine
+def _warn_unawaited_coroutine(coro):
+ msg_lines = [
+ f"coroutine '{coro.__qualname__}' was never awaited\n"
+ ]
+ if coro.cr_origin is not None:
+ import linecache, traceback
+ def extract():
+ for filename, lineno, funcname in reversed(coro.cr_origin):
+ line = linecache.getline(filename, lineno)
+ yield (filename, lineno, funcname, line)
+ msg_lines.append("Coroutine created at (most recent call last)\n")
+ msg_lines += traceback.format_list(list(extract()))
+ msg = "".join(msg_lines).rstrip("\n")
+ # Passing source= here means that if the user happens to have tracemalloc
+ # enabled and tracking where the coroutine was created, the warning will
+ # contain that traceback. This does mean that if they have *both*
+ # coroutine origin tracking *and* tracemalloc enabled, they'll get two
+ # partially-redundant tracebacks. If we wanted to be clever we could
+ # probably detect this case and avoid it, but for now we don't bother.
+ _wm.warn(
+ msg, category=RuntimeWarning, stacklevel=2, source=coro
+ )
+
+
+def _setup_defaults():
+ # Several warning categories are ignored by default in regular builds
+ if hasattr(sys, 'gettotalrefcount'):
+ return
+ _wm.filterwarnings("default", category=DeprecationWarning, module="__main__", append=1)
+ _wm.simplefilter("ignore", category=DeprecationWarning, append=1)
+ _wm.simplefilter("ignore", category=PendingDeprecationWarning, append=1)
+ _wm.simplefilter("ignore", category=ImportWarning, append=1)
+ _wm.simplefilter("ignore", category=ResourceWarning, append=1)
diff --git a/python/python3_13/examples/_pydatetime.py b/python/python3_14/examples/_pydatetime.py
similarity index 92%
rename from python/python3_13/examples/_pydatetime.py
rename to python/python3_14/examples/_pydatetime.py
index 34ccb2da13..70251dbb65 100644
--- a/python/python3_13/examples/_pydatetime.py
+++ b/python/python3_14/examples/_pydatetime.py
@@ -1,12 +1,10 @@
-"""Concrete date/time and related types.
-
-See http://www.iana.org/time-zones/repository/tz-link.html for
-time zone and DST data sources.
-"""
+"""Pure Python implementation of the datetime module."""
__all__ = ("date", "datetime", "time", "timedelta", "timezone", "tzinfo",
"MINYEAR", "MAXYEAR", "UTC")
+__name__ = "datetime"
+
import time as _time
import math as _math
@@ -18,10 +16,10 @@ def _cmp(x, y):
def _get_class_module(self):
module_name = self.__class__.__module__
- if module_name == '_pydatetime':
- return 'datetime'
+ if module_name == 'datetime':
+ return 'datetime.'
else:
- return module_name
+ return ''
MINYEAR = 1
MAXYEAR = 9999
@@ -64,14 +62,14 @@ def _days_in_month(year, month):
def _days_before_month(year, month):
"year, month -> number of days in year preceding first day of month."
- assert 1 <= month <= 12, 'month must be in 1..12'
+ assert 1 <= month <= 12, f"month must be in 1..12, not {month}"
return _DAYS_BEFORE_MONTH[month] + (month > 2 and _is_leap(year))
def _ymd2ord(year, month, day):
"year, month, day -> ordinal, considering 01-Jan-0001 as day 1."
- assert 1 <= month <= 12, 'month must be in 1..12'
+ assert 1 <= month <= 12, f"month must be in 1..12, not {month}"
dim = _days_in_month(year, month)
- assert 1 <= day <= dim, ('day must be in 1..%d' % dim)
+ assert 1 <= day <= dim, f"day must be in 1..{dim}, not {day}"
return (_days_before_year(year) +
_days_before_month(year, month) +
day)
@@ -204,6 +202,17 @@ def _format_offset(off, sep=':'):
s += '.%06d' % ss.microseconds
return s
+_normalize_century = None
+def _need_normalize_century():
+ global _normalize_century
+ if _normalize_century is None:
+ try:
+ _normalize_century = (
+ _time.strftime("%Y", (99, 1, 1, 0, 0, 0, 0, 1, 0)) != "0099")
+ except ValueError:
+ _normalize_century = True
+ return _normalize_century
+
# Correctly substitute for %z and %Z escapes in strftime formats.
def _wrap_strftime(object, format, timetuple):
# Don't call utcoffset() or tzname() unless actually needed.
@@ -261,6 +270,20 @@ def _wrap_strftime(object, format, timetuple):
# strftime is going to have at this: escape %
Zreplace = s.replace('%', '%%')
newformat.append(Zreplace)
+ # Note that datetime(1000, 1, 1).strftime('%G') == '1000' so
+ # year 1000 for %G can go on the fast path.
+ elif ((ch in 'YG' or ch in 'FC') and
+ object.year < 1000 and _need_normalize_century()):
+ if ch == 'G':
+ year = int(_time.strftime("%G", timetuple))
+ else:
+ year = object.year
+ if ch == 'C':
+ push('{:02}'.format(year // 100))
+ else:
+ push('{:04}'.format(year))
+ if ch == 'F':
+ push('-{:02}-{:02}'.format(*timetuple[1:3]))
else:
push('%')
push(ch)
@@ -399,9 +422,11 @@ def _parse_hh_mm_ss_ff(tstr):
if pos < len_str:
if tstr[pos] not in '.,':
- raise ValueError("Invalid microsecond component")
+ raise ValueError("Invalid microsecond separator")
else:
pos += 1
+ if not all(map(_is_ascii_digit, tstr[pos:])):
+ raise ValueError("Non-digit values in fraction")
len_remainder = len_str - pos
@@ -413,9 +438,6 @@ def _parse_hh_mm_ss_ff(tstr):
time_comps[3] = int(tstr[pos:(pos+to_parse)])
if to_parse < 6:
time_comps[3] *= _FRACTION_CORRECTION[to_parse-1]
- if (len_remainder > to_parse
- and not all(map(_is_ascii_digit, tstr[(pos+to_parse):]))):
- raise ValueError("Non-digit values in unparsed fraction")
return time_comps
@@ -431,6 +453,17 @@ def _parse_isoformat_time(tstr):
time_comps = _parse_hh_mm_ss_ff(timestr)
+ hour, minute, second, microsecond = time_comps
+ became_next_day = False
+ error_from_components = False
+ if (hour == 24):
+ if all(time_comp == 0 for time_comp in time_comps[1:]):
+ hour = 0
+ time_comps[0] = hour
+ became_next_day = True
+ else:
+ error_from_components = True
+
tzi = None
if tz_pos == len_str and tstr[-1] == 'Z':
tzi = timezone.utc
@@ -446,7 +479,7 @@ def _parse_isoformat_time(tstr):
# HH:MM:SS len: 8
# HH:MM:SS.f+ len: 10+
- if len(tzstr) in (0, 1, 3):
+ if len(tzstr) in (0, 1, 3) or tstr[tz_pos-1] == 'Z':
raise ValueError("Malformed time zone string")
tz_comps = _parse_hh_mm_ss_ff(tzstr)
@@ -463,13 +496,13 @@ def _parse_isoformat_time(tstr):
time_comps.append(tzi)
- return time_comps
+ return time_comps, became_next_day, error_from_components
# tuple[int, int, int] -> tuple[int, int, int] version of date.fromisocalendar
def _isoweek_to_gregorian(year, week, day):
# Year is bounded this way because 9999-12-31 is (9999, 52, 5)
if not MINYEAR <= year <= MAXYEAR:
- raise ValueError(f"Year is out of range: {year}")
+ raise ValueError(f"year must be in {MINYEAR}..{MAXYEAR}, not {year}")
if not 0 < week < 53:
out_of_range = True
@@ -502,7 +535,7 @@ def _isoweek_to_gregorian(year, week, day):
def _check_tzname(name):
if name is not None and not isinstance(name, str):
raise TypeError("tzinfo.tzname() must return None or string, "
- "not '%s'" % type(name))
+ f"not {type(name).__name__!r}")
# name is the offset-producing method, "utcoffset" or "dst".
# offset is what it returned.
@@ -515,24 +548,24 @@ def _check_utc_offset(name, offset):
if offset is None:
return
if not isinstance(offset, timedelta):
- raise TypeError("tzinfo.%s() must return None "
- "or timedelta, not '%s'" % (name, type(offset)))
+ raise TypeError(f"tzinfo.{name}() must return None "
+ f"or timedelta, not {type(offset).__name__!r}")
if not -timedelta(1) < offset < timedelta(1):
- raise ValueError("%s()=%s, must be strictly between "
- "-timedelta(hours=24) and timedelta(hours=24)" %
- (name, offset))
+ raise ValueError("offset must be a timedelta "
+ "strictly between -timedelta(hours=24) and "
+ f"timedelta(hours=24), not {offset!r}")
def _check_date_fields(year, month, day):
year = _index(year)
month = _index(month)
day = _index(day)
if not MINYEAR <= year <= MAXYEAR:
- raise ValueError('year must be in %d..%d' % (MINYEAR, MAXYEAR), year)
+ raise ValueError(f"year must be in {MINYEAR}..{MAXYEAR}, not {year}")
if not 1 <= month <= 12:
- raise ValueError('month must be in 1..12', month)
+ raise ValueError(f"month must be in 1..12, not {month}")
dim = _days_in_month(year, month)
if not 1 <= day <= dim:
- raise ValueError('day must be in 1..%d' % dim, day)
+ raise ValueError(f"day {day} must be in range 1..{dim} for month {month} in year {year}")
return year, month, day
def _check_time_fields(hour, minute, second, microsecond, fold):
@@ -541,20 +574,23 @@ def _check_time_fields(hour, minute, second, microsecond, fold):
second = _index(second)
microsecond = _index(microsecond)
if not 0 <= hour <= 23:
- raise ValueError('hour must be in 0..23', hour)
+ raise ValueError(f"hour must be in 0..23, not {hour}")
if not 0 <= minute <= 59:
- raise ValueError('minute must be in 0..59', minute)
+ raise ValueError(f"minute must be in 0..59, not {minute}")
if not 0 <= second <= 59:
- raise ValueError('second must be in 0..59', second)
+ raise ValueError(f"second must be in 0..59, not {second}")
if not 0 <= microsecond <= 999999:
- raise ValueError('microsecond must be in 0..999999', microsecond)
+ raise ValueError(f"microsecond must be in 0..999999, not {microsecond}")
if fold not in (0, 1):
- raise ValueError('fold must be either 0 or 1', fold)
+ raise ValueError(f"fold must be either 0 or 1, not {fold}")
return hour, minute, second, microsecond, fold
def _check_tzinfo_arg(tz):
if tz is not None and not isinstance(tz, tzinfo):
- raise TypeError("tzinfo argument must be None or of a tzinfo subclass")
+ raise TypeError(
+ "tzinfo argument must be None or of a tzinfo subclass, "
+ f"not {type(tz).__name__!r}"
+ )
def _divide_and_round(a, b):
"""divide a by b and round result to the nearest integer
@@ -608,7 +644,19 @@ def __new__(cls, days=0, seconds=0, microseconds=0,
# guide the C implementation; it's way more convoluted than speed-
# ignoring auto-overflow-to-long idiomatic Python could be.
- # XXX Check that all inputs are ints or floats.
+ for name, value in (
+ ("days", days),
+ ("seconds", seconds),
+ ("microseconds", microseconds),
+ ("milliseconds", milliseconds),
+ ("minutes", minutes),
+ ("hours", hours),
+ ("weeks", weeks)
+ ):
+ if not isinstance(value, (int, float)):
+ raise TypeError(
+ f"unsupported type for timedelta {name} component: {type(value).__name__}"
+ )
# Final values, all integer.
# s and us fit in 32-bit signed ints; d isn't bounded.
@@ -709,9 +757,9 @@ def __repr__(self):
args.append("microseconds=%d" % self._microseconds)
if not args:
args.append('0')
- return "%s.%s(%s)" % (_get_class_module(self),
- self.__class__.__qualname__,
- ', '.join(args))
+ return "%s%s(%s)" % (_get_class_module(self),
+ self.__class__.__qualname__,
+ ', '.join(args))
def __str__(self):
mm, ss = divmod(self._seconds, 60)
@@ -908,6 +956,7 @@ class date:
fromtimestamp()
today()
fromordinal()
+ strptime()
Operators:
@@ -990,8 +1039,12 @@ def fromordinal(cls, n):
@classmethod
def fromisoformat(cls, date_string):
"""Construct a date from a string in ISO 8601 format."""
+
if not isinstance(date_string, str):
- raise TypeError('fromisoformat: argument must be str')
+ raise TypeError('Argument must be a str')
+
+ if not date_string.isascii():
+ raise ValueError('Argument must be an ASCII str')
if len(date_string) not in (7, 8, 10):
raise ValueError(f'Invalid isoformat string: {date_string!r}')
@@ -1008,6 +1061,12 @@ def fromisocalendar(cls, year, week, day):
This is the inverse of the date.isocalendar() function"""
return cls(*_isoweek_to_gregorian(year, week, day))
+ @classmethod
+ def strptime(cls, date_string, format):
+ """Parse a date string according to the given format (like time.strptime())."""
+ import _strptime
+ return _strptime._strptime_datetime_date(cls, date_string, format)
+
# Conversions to string
def __repr__(self):
@@ -1017,11 +1076,11 @@ def __repr__(self):
>>> repr(d)
'datetime.date(2010, 1, 1)'
"""
- return "%s.%s(%d, %d, %d)" % (_get_class_module(self),
- self.__class__.__qualname__,
- self._year,
- self._month,
- self._day)
+ return "%s%s(%d, %d, %d)" % (_get_class_module(self),
+ self.__class__.__qualname__,
+ self._year,
+ self._month,
+ self._day)
# XXX These shouldn't depend on time.localtime(), because that
# clips the usable dates to [1970 .. 2038). At least ctime() is
# easily done without using strftime() -- that's better too because
@@ -1057,8 +1116,8 @@ def isoformat(self):
This is 'YYYY-MM-DD'.
References:
- - http://www.w3.org/TR/NOTE-datetime
- - http://www.cl.cam.ac.uk/~mgk25/iso-time.html
+ - https://www.w3.org/TR/NOTE-datetime
+ - https://www.cl.cam.ac.uk/~mgk25/iso-time.html
"""
return "%04d-%02d-%02d" % (self._year, self._month, self._day)
@@ -1192,7 +1251,7 @@ def isocalendar(self):
The first week is 1; Monday is 1 ... Sunday is 7.
ISO calendar algorithm taken from
- http://www.phys.uu.nl/~vgent/calendar/isocalendar.htm
+ https://www.phys.uu.nl/~vgent/calendar/isocalendar.htm
(used with permission)
"""
year = self._year
@@ -1328,6 +1387,7 @@ class time:
Constructors:
__new__()
+ strptime()
Operators:
@@ -1386,6 +1446,12 @@ def __new__(cls, hour=0, minute=0, second=0, microsecond=0, tzinfo=None, *, fold
self._fold = fold
return self
+ @classmethod
+ def strptime(cls, date_string, format):
+ """string, format -> new time parsed from a string (like time.strptime())."""
+ import _strptime
+ return _strptime._strptime_datetime_time(cls, date_string, format)
+
# Read-only field accessors
@property
def hour(self):
@@ -1514,7 +1580,7 @@ def __repr__(self):
s = ", %d" % self._second
else:
s = ""
- s= "%s.%s(%d, %d%s)" % (_get_class_module(self),
+ s = "%s%s(%d, %d%s)" % (_get_class_module(self),
self.__class__.__qualname__,
self._hour, self._minute, s)
if self._tzinfo is not None:
@@ -1556,7 +1622,7 @@ def fromisoformat(cls, time_string):
time_string = time_string.removeprefix('T')
try:
- return cls(*_parse_isoformat_time(time_string))
+ return cls(*_parse_isoformat_time(time_string)[0])
except Exception:
raise ValueError(f'Invalid isoformat string: {time_string!r}')
@@ -1870,10 +1936,27 @@ def fromisoformat(cls, date_string):
if tstr:
try:
- time_components = _parse_isoformat_time(tstr)
+ time_components, became_next_day, error_from_components = _parse_isoformat_time(tstr)
except ValueError:
raise ValueError(
f'Invalid isoformat string: {date_string!r}') from None
+ else:
+ if error_from_components:
+ raise ValueError("minute, second, and microsecond must be 0 when hour is 24")
+
+ if became_next_day:
+ year, month, day = date_components
+ # Only wrap day/month when it was previously valid
+ if month <= 12 and day <= (days_in_month := _days_in_month(year, month)):
+ # Calculate midnight of the next day
+ day += 1
+ if day > days_in_month:
+ day = 1
+ month += 1
+ if month > 12:
+ month = 1
+ year += 1
+ date_components = [year, month, day]
else:
time_components = [0, 0, 0, 0, None]
@@ -2045,7 +2128,7 @@ def isoformat(self, sep='T', timespec='auto'):
By default, the fractional part is omitted if self.microsecond == 0.
If self.tzinfo is not None, the UTC offset is also attached, giving
- giving a full format of 'YYYY-MM-DD HH:MM:SS.mmmmmm+HH:MM'.
+ a full format of 'YYYY-MM-DD HH:MM:SS.mmmmmm+HH:MM'.
Optional argument sep specifies the separator between date and
time, default 'T'.
@@ -2073,9 +2156,9 @@ def __repr__(self):
del L[-1]
if L[-1] == 0:
del L[-1]
- s = "%s.%s(%s)" % (_get_class_module(self),
- self.__class__.__qualname__,
- ", ".join(map(str, L)))
+ s = "%s%s(%s)" % (_get_class_module(self),
+ self.__class__.__qualname__,
+ ", ".join(map(str, L)))
if self._tzinfo is not None:
assert s[-1:] == ")"
s = s[:-1] + ", tzinfo=%r" % self._tzinfo + ")"
@@ -2092,7 +2175,7 @@ def __str__(self):
def strptime(cls, date_string, format):
'string, format -> new datetime parsed from a string (like time.strptime()).'
import _strptime
- return _strptime._strptime_datetime(cls, date_string, format)
+ return _strptime._strptime_datetime_datetime(cls, date_string, format)
def utcoffset(self):
"""Return the timezone offset as timedelta positive east of UTC (negative west of
@@ -2306,7 +2389,6 @@ def __reduce__(self):
def _isoweek1monday(year):
# Helper to calculate the day number of the Monday starting week 1
- # XXX This could be done more efficiently
THURSDAY = 3
firstday = _ymd2ord(year, 1, 1)
firstweekday = (firstday + 6) % 7 # See weekday() above
@@ -2333,7 +2415,7 @@ def __new__(cls, offset, name=_Omitted):
if not cls._minoffset <= offset <= cls._maxoffset:
raise ValueError("offset must be a timedelta "
"strictly between -timedelta(hours=24) and "
- "timedelta(hours=24).")
+ f"timedelta(hours=24), not {offset!r}")
return cls._create(offset, name)
def __init_subclass__(cls):
@@ -2373,12 +2455,12 @@ def __repr__(self):
if self is self.utc:
return 'datetime.timezone.utc'
if self._name is None:
- return "%s.%s(%r)" % (_get_class_module(self),
- self.__class__.__qualname__,
- self._offset)
- return "%s.%s(%r, %r)" % (_get_class_module(self),
- self.__class__.__qualname__,
- self._offset, self._name)
+ return "%s%s(%r)" % (_get_class_module(self),
+ self.__class__.__qualname__,
+ self._offset)
+ return "%s%s(%r, %r)" % (_get_class_module(self),
+ self.__class__.__qualname__,
+ self._offset, self._name)
def __str__(self):
return self.tzname(None)
diff --git a/python/python3_13/examples/_pydecimal.py b/python/python3_14/examples/_pydecimal.py
similarity index 98%
rename from python/python3_13/examples/_pydecimal.py
rename to python/python3_14/examples/_pydecimal.py
index 75df3db262..97a629fe92 100644
--- a/python/python3_13/examples/_pydecimal.py
+++ b/python/python3_14/examples/_pydecimal.py
@@ -38,10 +38,10 @@
'ROUND_FLOOR', 'ROUND_UP', 'ROUND_HALF_DOWN', 'ROUND_05UP',
# Functions for manipulating contexts
- 'setcontext', 'getcontext', 'localcontext',
+ 'setcontext', 'getcontext', 'localcontext', 'IEEEContext',
# Limits for the C version for compatibility
- 'MAX_PREC', 'MAX_EMAX', 'MIN_EMIN', 'MIN_ETINY',
+ 'MAX_PREC', 'MAX_EMAX', 'MIN_EMIN', 'MIN_ETINY', 'IEEE_CONTEXT_MAX_BITS',
# C version: compile time choice that enables the thread local context (deprecated, now always true)
'HAVE_THREADS',
@@ -83,10 +83,12 @@
MAX_PREC = 999999999999999999
MAX_EMAX = 999999999999999999
MIN_EMIN = -999999999999999999
+ IEEE_CONTEXT_MAX_BITS = 512
else:
MAX_PREC = 425000000
MAX_EMAX = 425000000
MIN_EMIN = -425000000
+ IEEE_CONTEXT_MAX_BITS = 256
MIN_ETINY = MIN_EMIN - (MAX_PREC-1)
@@ -97,7 +99,7 @@ class DecimalException(ArithmeticError):
Used exceptions derive from this.
If an exception derives from another exception besides this (such as
- Underflow (Inexact, Rounded, Subnormal) that indicates that it is only
+ Underflow (Inexact, Rounded, Subnormal)) that indicates that it is only
called if the others are present. This isn't actually used for
anything, though.
@@ -145,7 +147,7 @@ class InvalidOperation(DecimalException):
x ** (+-)INF
An operand is invalid
- The result of the operation after these is a quiet positive NaN,
+ The result of the operation after this is a quiet positive NaN,
except when the cause is a signaling NaN, in which case the result is
also a quiet NaN, but with the original sign, and an optional
diagnostic information.
@@ -417,6 +419,27 @@ def sin(x):
return ctx_manager
+def IEEEContext(bits, /):
+ """
+ Return a context object initialized to the proper values for one of the
+ IEEE interchange formats. The argument must be a multiple of 32 and less
+ than IEEE_CONTEXT_MAX_BITS.
+ """
+ if bits <= 0 or bits > IEEE_CONTEXT_MAX_BITS or bits % 32:
+ raise ValueError("argument must be a multiple of 32, "
+ f"with a maximum of {IEEE_CONTEXT_MAX_BITS}")
+
+ ctx = Context()
+ ctx.prec = 9 * (bits//32) - 2
+ ctx.Emax = 3 * (1 << (bits//16 + 3))
+ ctx.Emin = 1 - ctx.Emax
+ ctx.rounding = ROUND_HALF_EVEN
+ ctx.clamp = 1
+ ctx.traps = dict.fromkeys(_signals, False)
+
+ return ctx
+
+
##### Decimal class #######################################################
# Do not subclass Decimal from numbers.Real and do not register it as such
@@ -582,6 +605,21 @@ def __new__(cls, value="0", context=None):
raise TypeError("Cannot convert %r to Decimal" % value)
+ @classmethod
+ def from_number(cls, number):
+ """Converts a real number to a decimal number, exactly.
+
+ >>> Decimal.from_number(314) # int
+ Decimal('314')
+ >>> Decimal.from_number(0.1) # float
+ Decimal('0.1000000000000000055511151231257827021181583404541015625')
+ >>> Decimal.from_number(Decimal('3.14')) # another decimal instance
+ Decimal('3.14')
+ """
+ if isinstance(number, (int, Decimal, float)):
+ return cls(number)
+ raise TypeError("Cannot convert %r to Decimal" % number)
+
@classmethod
def from_float(cls, f):
"""Converts a float to a decimal number, exactly.
@@ -2425,12 +2463,12 @@ def __pow__(self, other, modulo=None, context=None):
return ans
- def __rpow__(self, other, context=None):
+ def __rpow__(self, other, modulo=None, context=None):
"""Swaps self/other and returns __pow__."""
other = _convert_other(other)
if other is NotImplemented:
return other
- return other.__pow__(self, context=context)
+ return other.__pow__(self, modulo, context=context)
def normalize(self, context=None):
"""Normalize- strip trailing 0s, change anything equal to 0 to 0e0"""
@@ -3302,7 +3340,10 @@ def _fill_logical(self, context, opa, opb):
return opa, opb
def logical_and(self, other, context=None):
- """Applies an 'and' operation between self and other's digits."""
+ """Applies an 'and' operation between self and other's digits.
+
+ Both self and other must be logical numbers.
+ """
if context is None:
context = getcontext()
@@ -3319,14 +3360,20 @@ def logical_and(self, other, context=None):
return _dec_from_triple(0, result.lstrip('0') or '0', 0)
def logical_invert(self, context=None):
- """Invert all its digits."""
+ """Invert all its digits.
+
+ The self must be logical number.
+ """
if context is None:
context = getcontext()
return self.logical_xor(_dec_from_triple(0,'1'*context.prec,0),
context)
def logical_or(self, other, context=None):
- """Applies an 'or' operation between self and other's digits."""
+ """Applies an 'or' operation between self and other's digits.
+
+ Both self and other must be logical numbers.
+ """
if context is None:
context = getcontext()
@@ -3343,7 +3390,10 @@ def logical_or(self, other, context=None):
return _dec_from_triple(0, result.lstrip('0') or '0', 0)
def logical_xor(self, other, context=None):
- """Applies an 'xor' operation between self and other's digits."""
+ """Applies an 'xor' operation between self and other's digits.
+
+ Both self and other must be logical numbers.
+ """
if context is None:
context = getcontext()
@@ -6058,7 +6108,7 @@ def _convert_for_comparison(self, other, equality_op=False):
(?P\d*) # with (possibly empty) diagnostic info.
)
# \s*
- \Z
+ \z
""", re.VERBOSE | re.IGNORECASE).match
_all_zeros = re.compile('0*$').match
@@ -6082,11 +6132,15 @@ def _convert_for_comparison(self, other, equality_op=False):
(?Pz)?
(?P\#)?
(?P0)?
-(?P(?!0)\d+)?
-(?P,)?
-(?:\.(?P0|(?!0)\d+))?
+(?P\d+)?
+(?P[,_])?
+(?:\.
+ (?=[\d,_]) # lookahead for digit or separator
+ (?P\d+)?
+ (?P[,_])?
+)?
(?P[eEfFgGn%])?
-\Z
+\z
""", re.VERBOSE|re.DOTALL)
del re
@@ -6177,6 +6231,9 @@ def _parse_format_specifier(format_spec, _localeconv=None):
format_dict['grouping'] = [3, 0]
format_dict['decimal_point'] = '.'
+ if format_dict['frac_separators'] is None:
+ format_dict['frac_separators'] = ''
+
return format_dict
def _format_align(sign, body, spec):
@@ -6296,6 +6353,11 @@ def _format_number(is_negative, intpart, fracpart, exp, spec):
sign = _format_sign(is_negative, spec)
+ frac_sep = spec['frac_separators']
+ if fracpart and frac_sep:
+ fracpart = frac_sep.join(fracpart[pos:pos + 3]
+ for pos in range(0, len(fracpart), 3))
+
if fracpart or spec['alt']:
fracpart = spec['decimal_point'] + fracpart
diff --git a/python/python3_13/examples/_pyio.py b/python/python3_14/examples/_pyio.py
similarity index 94%
rename from python/python3_13/examples/_pyio.py
rename to python/python3_14/examples/_pyio.py
index a3fede6992..612e4a175e 100644
--- a/python/python3_13/examples/_pyio.py
+++ b/python/python3_14/examples/_pyio.py
@@ -16,15 +16,16 @@
_setmode = None
import io
-from io import (__all__, SEEK_SET, SEEK_CUR, SEEK_END)
+from io import (__all__, SEEK_SET, SEEK_CUR, SEEK_END, Reader, Writer) # noqa: F401
valid_seek_flags = {0, 1, 2} # Hardwired values
if hasattr(os, 'SEEK_HOLE') :
valid_seek_flags.add(os.SEEK_HOLE)
valid_seek_flags.add(os.SEEK_DATA)
-# open() uses st_blksize whenever we can
-DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
+# open() uses max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)
+# when the device block size is available.
+DEFAULT_BUFFER_SIZE = 128 * 1024 # bytes
# NOTE: Base classes defined here are registered with the "official" ABCs
# defined in io.py. We don't use real inheritance though, because we don't want
@@ -123,10 +124,10 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
the size of a fixed-size chunk buffer. When no buffering argument is
given, the default buffering policy works as follows:
- * Binary files are buffered in fixed-size chunks; the size of the buffer
- is chosen using a heuristic trying to determine the underlying device's
- "block size" and falling back on `io.DEFAULT_BUFFER_SIZE`.
- On many systems, the buffer will typically be 4096 or 8192 bytes long.
+ * Binary files are buffered in fixed-size chunks; the size of the buffer
+ is max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)
+ when the device block size is available.
+ On most systems, the buffer will typically be 128 kilobytes long.
* "Interactive" text files (files for which isatty() returns True)
use line buffering. Other text files use the policy described above
@@ -238,18 +239,11 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
result = raw
try:
line_buffering = False
- if buffering == 1 or buffering < 0 and raw.isatty():
+ if buffering == 1 or buffering < 0 and raw._isatty_open_only():
buffering = -1
line_buffering = True
if buffering < 0:
- buffering = DEFAULT_BUFFER_SIZE
- try:
- bs = os.fstat(raw.fileno()).st_blksize
- except (OSError, AttributeError):
- pass
- else:
- if bs > 1:
- buffering = bs
+ buffering = max(min(raw._blksize, 8192 * 1024), DEFAULT_BUFFER_SIZE)
if buffering < 0:
raise ValueError("invalid buffering size")
if buffering == 0:
@@ -413,6 +407,9 @@ def __del__(self):
if closed:
return
+ if dealloc_warn := getattr(self, "_dealloc_warn", None):
+ dealloc_warn(self)
+
# If close() fails, the caller logs the exception with
# sys.unraisablehook. close() must be called at the end at __del__().
self.close()
@@ -620,6 +617,8 @@ def read(self, size=-1):
n = self.readinto(b)
if n is None:
return None
+ if n < 0 or n > len(b):
+ raise ValueError(f"readinto returned {n} outside buffer size {len(b)}")
del b[n:]
return bytes(b)
@@ -651,8 +650,6 @@ def write(self, b):
self._unsupported("write")
io.RawIOBase.register(RawIOBase)
-from _io import FileIO
-RawIOBase.register(FileIO)
class BufferedIOBase(IOBase):
@@ -859,6 +856,10 @@ def __repr__(self):
else:
return "<{}.{} name={!r}>".format(modname, clsname, name)
+ def _dealloc_warn(self, source):
+ if dealloc_warn := getattr(self.raw, "_dealloc_warn", None):
+ dealloc_warn(source)
+
### Lower-level APIs ###
def fileno(self):
@@ -944,10 +945,8 @@ def write(self, b):
return 0
pos = self._pos
if pos > len(self._buffer):
- # Inserts null bytes between the current end of the file
- # and the new write position.
- padding = b'\x00' * (pos - len(self._buffer))
- self._buffer += padding
+ # Pad buffer to pos with null bytes.
+ self._buffer.resize(pos)
self._buffer[pos:pos + n] = b
self._pos += n
return n
@@ -1463,6 +1462,17 @@ def write(self, b):
return BufferedWriter.write(self, b)
+def _new_buffersize(bytes_read):
+ # Parallels _io/fileio.c new_buffersize
+ if bytes_read > 65536:
+ addend = bytes_read >> 3
+ else:
+ addend = 256 + bytes_read
+ if addend < DEFAULT_BUFFER_SIZE:
+ addend = DEFAULT_BUFFER_SIZE
+ return bytes_read + addend
+
+
class FileIO(RawIOBase):
_fd = -1
_created = False
@@ -1487,6 +1497,7 @@ def __init__(self, file, mode='r', closefd=True, opener=None):
"""
if self._fd >= 0:
# Have to close the existing file first.
+ self._stat_atopen = None
try:
if self._closefd:
os.close(self._fd)
@@ -1559,24 +1570,22 @@ def __init__(self, file, mode='r', closefd=True, opener=None):
if not isinstance(fd, int):
raise TypeError('expected integer from opener')
if fd < 0:
- raise OSError('Negative file descriptor')
+ # bpo-27066: Raise a ValueError for bad value.
+ raise ValueError(f'opener returned {fd}')
owned_fd = fd
if not noinherit_flag:
os.set_inheritable(fd, False)
self._closefd = closefd
- fdfstat = os.fstat(fd)
+ self._stat_atopen = os.fstat(fd)
try:
- if stat.S_ISDIR(fdfstat.st_mode):
+ if stat.S_ISDIR(self._stat_atopen.st_mode):
raise IsADirectoryError(errno.EISDIR,
os.strerror(errno.EISDIR), file)
except AttributeError:
# Ignore the AttributeError if stat.S_ISDIR or errno.EISDIR
# don't exist.
pass
- self._blksize = getattr(fdfstat, 'st_blksize', 0)
- if self._blksize <= 1:
- self._blksize = DEFAULT_BUFFER_SIZE
if _setmode:
# don't translate newlines (\r\n <=> \n)
@@ -1593,17 +1602,17 @@ def __init__(self, file, mode='r', closefd=True, opener=None):
if e.errno != errno.ESPIPE:
raise
except:
+ self._stat_atopen = None
if owned_fd is not None:
os.close(owned_fd)
raise
self._fd = fd
- def __del__(self):
+ def _dealloc_warn(self, source):
if self._fd >= 0 and self._closefd and not self.closed:
import warnings
- warnings.warn('unclosed file %r' % (self,), ResourceWarning,
+ warnings.warn(f'unclosed file {source!r}', ResourceWarning,
stacklevel=2, source=self)
- self.close()
def __getstate__(self):
raise TypeError(f"cannot pickle {self.__class__.__name__!r} object")
@@ -1622,6 +1631,17 @@ def __repr__(self):
return ('<%s name=%r mode=%r closefd=%r>' %
(class_name, name, self.mode, self._closefd))
+ @property
+ def _blksize(self):
+ if self._stat_atopen is None:
+ return DEFAULT_BUFFER_SIZE
+
+ blksize = getattr(self._stat_atopen, "st_blksize", 0)
+ # WASI sets blsize to 0
+ if not blksize:
+ return DEFAULT_BUFFER_SIZE
+ return blksize
+
def _checkReadable(self):
if not self._readable:
raise UnsupportedOperation('File not open for reading')
@@ -1633,7 +1653,13 @@ def _checkWritable(self, msg=None):
def read(self, size=None):
"""Read at most size bytes, returned as bytes.
- Only makes one system call, so less data may be returned than requested
+ If size is less than 0, read all bytes in the file making
+ multiple read calls. See ``FileIO.readall``.
+
+ Attempts to make only one system call, retrying only per
+ PEP 475 (EINTR). This means less data may be returned than
+ requested.
+
In non-blocking mode, returns None if no data is available.
Return an empty bytes object at EOF.
"""
@@ -1649,45 +1675,57 @@ def read(self, size=None):
def readall(self):
"""Read all data from the file, returned as bytes.
- In non-blocking mode, returns as much as is immediately available,
- or None if no data is available. Return an empty bytes object at EOF.
+ Reads until either there is an error or read() returns size 0
+ (indicates EOF). If the file is already at EOF, returns an
+ empty bytes object.
+
+ In non-blocking mode, returns as much data as could be read
+ before EAGAIN. If no data is available (EAGAIN is returned
+ before bytes are read) returns None.
"""
self._checkClosed()
self._checkReadable()
- bufsize = DEFAULT_BUFFER_SIZE
- try:
- pos = os.lseek(self._fd, 0, SEEK_CUR)
- end = os.fstat(self._fd).st_size
- if end >= pos:
- bufsize = end - pos + 1
- except OSError:
- pass
+ if self._stat_atopen is None or self._stat_atopen.st_size <= 0:
+ bufsize = DEFAULT_BUFFER_SIZE
+ else:
+ # In order to detect end of file, need a read() of at least 1
+ # byte which returns size 0. Oversize the buffer by 1 byte so the
+ # I/O can be completed with two read() calls (one for all data, one
+ # for EOF) without needing to resize the buffer.
+ bufsize = self._stat_atopen.st_size + 1
- result = bytearray()
- while True:
- if len(result) >= bufsize:
- bufsize = len(result)
- bufsize += max(bufsize, DEFAULT_BUFFER_SIZE)
- n = bufsize - len(result)
- try:
- chunk = os.read(self._fd, n)
- except BlockingIOError:
- if result:
- break
+ if self._stat_atopen.st_size > 65536:
+ try:
+ pos = os.lseek(self._fd, 0, SEEK_CUR)
+ if self._stat_atopen.st_size >= pos:
+ bufsize = self._stat_atopen.st_size - pos + 1
+ except OSError:
+ pass
+
+ result = bytearray(bufsize)
+ bytes_read = 0
+ try:
+ while n := os.readinto(self._fd, memoryview(result)[bytes_read:]):
+ bytes_read += n
+ if bytes_read >= len(result):
+ result.resize(_new_buffersize(bytes_read))
+ except BlockingIOError:
+ if not bytes_read:
return None
- if not chunk: # reached the end of the file
- break
- result += chunk
+ assert len(result) - bytes_read >= 1, \
+ "os.readinto buffer size 0 will result in erroneous EOF / returns 0"
+ result.resize(bytes_read)
return bytes(result)
- def readinto(self, b):
+ def readinto(self, buffer):
"""Same as RawIOBase.readinto()."""
- m = memoryview(b).cast('B')
- data = self.read(len(m))
- n = len(data)
- m[:n] = data
- return n
+ self._checkClosed()
+ self._checkReadable()
+ try:
+ return os.readinto(self._fd, buffer)
+ except BlockingIOError:
+ return None
def write(self, b):
"""Write bytes b to file, return number written.
@@ -1737,6 +1775,7 @@ def truncate(self, size=None):
if size is None:
size = self.tell()
os.ftruncate(self._fd, size)
+ self._stat_atopen = None
return size
def close(self):
@@ -1746,8 +1785,9 @@ def close(self):
called more than once without error.
"""
if not self.closed:
+ self._stat_atopen = None
try:
- if self._closefd:
+ if self._closefd and self._fd >= 0:
os.close(self._fd)
finally:
super().close()
@@ -1784,6 +1824,21 @@ def isatty(self):
self._checkClosed()
return os.isatty(self._fd)
+ def _isatty_open_only(self):
+ """Checks whether the file is a TTY using an open-only optimization.
+
+ TTYs are always character devices. If the interpreter knows a file is
+ not a character device when it would call ``isatty``, can skip that
+ call. Inside ``open()`` there is a fresh stat result that contains that
+ information. Use the stat result to skip a system call. Outside of that
+ context TOCTOU issues (the fd could be arbitrarily modified by
+ surrounding code).
+ """
+ if (self._stat_atopen is not None
+ and not stat.S_ISCHR(self._stat_atopen.st_mode)):
+ return False
+ return os.isatty(self._fd)
+
@property
def closefd(self):
"""True if the file descriptor will be closed by close()."""
@@ -2008,8 +2063,7 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None,
raise ValueError("invalid encoding: %r" % encoding)
if not codecs.lookup(encoding)._is_text_encoding:
- msg = ("%r is not a text encoding; "
- "use codecs.open() to handle arbitrary codecs")
+ msg = "%r is not a text encoding"
raise LookupError(msg % encoding)
if errors is None:
@@ -2517,9 +2571,12 @@ def read(self, size=None):
size = size_index()
decoder = self._decoder or self._get_decoder()
if size < 0:
+ chunk = self.buffer.read()
+ if chunk is None:
+ raise BlockingIOError("Read returned None.")
# Read everything.
result = (self._get_decoded_chars() +
- decoder.decode(self.buffer.read(), final=True))
+ decoder.decode(chunk, final=True))
if self._snapshot is not None:
self._set_decoded_chars('')
self._snapshot = None
@@ -2639,6 +2696,10 @@ def readline(self, size=None):
def newlines(self):
return self._decoder.newlines if self._decoder else None
+ def _dealloc_warn(self, source):
+ if dealloc_warn := getattr(self.buffer, "_dealloc_warn", None):
+ dealloc_warn(source)
+
class StringIO(TextIOWrapper):
"""Text I/O implementation using an in-memory buffer.
diff --git a/python/python3_14/examples/_pylong.py b/python/python3_14/examples/_pylong.py
new file mode 100644
index 0000000000..be1acd17ce
--- /dev/null
+++ b/python/python3_14/examples/_pylong.py
@@ -0,0 +1,729 @@
+"""Python implementations of some algorithms for use by longobject.c.
+The goal is to provide asymptotically faster algorithms that can be
+used for operations on integers with many digits. In those cases, the
+performance overhead of the Python implementation is not significant
+since the asymptotic behavior is what dominates runtime. Functions
+provided by this module should be considered private and not part of any
+public API.
+
+Note: for ease of maintainability, please prefer clear code and avoid
+"micro-optimizations". This module will only be imported and used for
+integers with a huge number of digits. Saving a few microseconds with
+tricky or non-obvious code is not worth it. For people looking for
+maximum performance, they should use something like gmpy2."""
+
+import re
+import decimal
+try:
+ import _decimal
+except ImportError:
+ _decimal = None
+
+# A number of functions have this form, where `w` is a desired number of
+# digits in base `base`:
+#
+# def inner(...w...):
+# if w <= LIMIT:
+# return something
+# lo = w >> 1
+# hi = w - lo
+# something involving base**lo, inner(...lo...), j, and inner(...hi...)
+# figure out largest w needed
+# result = inner(w)
+#
+# They all had some on-the-fly scheme to cache `base**lo` results for reuse.
+# Power is costly.
+#
+# This routine aims to compute all amd only the needed powers in advance, as
+# efficiently as reasonably possible. This isn't trivial, and all the
+# on-the-fly methods did needless work in many cases. The driving code above
+# changes to:
+#
+# figure out largest w needed
+# mycache = compute_powers(w, base, LIMIT)
+# result = inner(w)
+#
+# and `mycache[lo]` replaces `base**lo` in the inner function.
+#
+# If an algorithm wants the powers of ceiling(w/2) instead of the floor,
+# pass keyword argument `need_hi=True`.
+#
+# While this does give minor speedups (a few percent at best), the
+# primary intent is to simplify the functions using this, by eliminating
+# the need for them to craft their own ad-hoc caching schemes.
+#
+# See code near end of file for a block of code that can be enabled to
+# run millions of tests.
+def compute_powers(w, base, more_than, *, need_hi=False, show=False):
+ seen = set()
+ need = set()
+ ws = {w}
+ while ws:
+ w = ws.pop() # any element is fine to use next
+ if w in seen or w <= more_than:
+ continue
+ seen.add(w)
+ lo = w >> 1
+ hi = w - lo
+ # only _need_ one here; the other may, or may not, be needed
+ which = hi if need_hi else lo
+ need.add(which)
+ ws.add(which)
+ if lo != hi:
+ ws.add(w - which)
+
+ # `need` is the set of exponents needed. To compute them all
+ # efficiently, possibly add other exponents to `extra`. The goal is
+ # to ensure that each exponent can be gotten from a smaller one via
+ # multiplying by the base, squaring it, or squaring and then
+ # multiplying by the base.
+ #
+ # If need_hi is False, this is already the case (w can always be
+ # gotten from w >> 1 via one of the squaring strategies). But we do
+ # the work anyway, just in case ;-)
+ #
+ # Note that speed is irrelevant. These loops are working on little
+ # ints (exponents) and go around O(log w) times. The total cost is
+ # insignificant compared to just one of the bigint multiplies.
+ cands = need.copy()
+ extra = set()
+ while cands:
+ w = max(cands)
+ cands.remove(w)
+ lo = w >> 1
+ if lo > more_than and w-1 not in cands and lo not in cands:
+ extra.add(lo)
+ cands.add(lo)
+ assert need_hi or not extra
+
+ d = {}
+ for n in sorted(need | extra):
+ lo = n >> 1
+ hi = n - lo
+ if n-1 in d:
+ if show:
+ print("* base", end="")
+ result = d[n-1] * base # cheap!
+ elif lo in d:
+ # Multiplying a bigint by itself is about twice as fast
+ # in CPython provided it's the same object.
+ if show:
+ print("square", end="")
+ result = d[lo] * d[lo] # same object
+ if hi != lo:
+ if show:
+ print(" * base", end="")
+ assert 2 * lo + 1 == n
+ result *= base
+ else: # rare
+ if show:
+ print("pow", end='')
+ result = base ** n
+ if show:
+ print(" at", n, "needed" if n in need else "extra")
+ d[n] = result
+
+ assert need <= d.keys()
+ if excess := d.keys() - need:
+ assert need_hi
+ for n in excess:
+ del d[n]
+ return d
+
+_unbounded_dec_context = decimal.getcontext().copy()
+_unbounded_dec_context.prec = decimal.MAX_PREC
+_unbounded_dec_context.Emax = decimal.MAX_EMAX
+_unbounded_dec_context.Emin = decimal.MIN_EMIN
+_unbounded_dec_context.traps[decimal.Inexact] = 1 # sanity check
+
+def int_to_decimal(n):
+ """Asymptotically fast conversion of an 'int' to Decimal."""
+
+ # Function due to Tim Peters. See GH issue #90716 for details.
+ # https://github.com/python/cpython/issues/90716
+ #
+ # The implementation in longobject.c of base conversion algorithms
+ # between power-of-2 and non-power-of-2 bases are quadratic time.
+ # This function implements a divide-and-conquer algorithm that is
+ # faster for large numbers. Builds an equal decimal.Decimal in a
+ # "clever" recursive way. If we want a string representation, we
+ # apply str to _that_.
+
+ from decimal import Decimal as D
+ BITLIM = 200
+
+ # Don't bother caching the "lo" mask in this; the time to compute it is
+ # tiny compared to the multiply.
+ def inner(n, w):
+ if w <= BITLIM:
+ return D(n)
+ w2 = w >> 1
+ hi = n >> w2
+ lo = n & ((1 << w2) - 1)
+ return inner(lo, w2) + inner(hi, w - w2) * w2pow[w2]
+
+ with decimal.localcontext(_unbounded_dec_context):
+ nbits = n.bit_length()
+ w2pow = compute_powers(nbits, D(2), BITLIM)
+ if n < 0:
+ negate = True
+ n = -n
+ else:
+ negate = False
+ result = inner(n, nbits)
+ if negate:
+ result = -result
+ return result
+
+def int_to_decimal_string(n):
+ """Asymptotically fast conversion of an 'int' to a decimal string."""
+ w = n.bit_length()
+ if w > 450_000 and _decimal is not None:
+ # It is only usable with the C decimal implementation.
+ # _pydecimal.py calls str() on very large integers, which in its
+ # turn calls int_to_decimal_string(), causing very deep recursion.
+ return str(int_to_decimal(n))
+
+ # Fallback algorithm for the case when the C decimal module isn't
+ # available. This algorithm is asymptotically worse than the algorithm
+ # using the decimal module, but better than the quadratic time
+ # implementation in longobject.c.
+
+ DIGLIM = 1000
+ def inner(n, w):
+ if w <= DIGLIM:
+ return str(n)
+ w2 = w >> 1
+ hi, lo = divmod(n, pow10[w2])
+ return inner(hi, w - w2) + inner(lo, w2).zfill(w2)
+
+ # The estimation of the number of decimal digits.
+ # There is no harm in small error. If we guess too large, there may
+ # be leading 0's that need to be stripped. If we guess too small, we
+ # may need to call str() recursively for the remaining highest digits,
+ # which can still potentially be a large integer. This is manifested
+ # only if the number has way more than 10**15 digits, that exceeds
+ # the 52-bit physical address limit in both Intel64 and AMD64.
+ w = int(w * 0.3010299956639812 + 1) # log10(2)
+ pow10 = compute_powers(w, 5, DIGLIM)
+ for k, v in pow10.items():
+ pow10[k] = v << k # 5**k << k == 5**k * 2**k == 10**k
+ if n < 0:
+ n = -n
+ sign = '-'
+ else:
+ sign = ''
+ s = inner(n, w)
+ if s[0] == '0' and n:
+ # If our guess of w is too large, there may be leading 0's that
+ # need to be stripped.
+ s = s.lstrip('0')
+ return sign + s
+
+def _str_to_int_inner(s):
+ """Asymptotically fast conversion of a 'str' to an 'int'."""
+
+ # Function due to Bjorn Martinsson. See GH issue #90716 for details.
+ # https://github.com/python/cpython/issues/90716
+ #
+ # The implementation in longobject.c of base conversion algorithms
+ # between power-of-2 and non-power-of-2 bases are quadratic time.
+ # This function implements a divide-and-conquer algorithm making use
+ # of Python's built in big int multiplication. Since Python uses the
+ # Karatsuba algorithm for multiplication, the time complexity
+ # of this function is O(len(s)**1.58).
+
+ DIGLIM = 2048
+
+ def inner(a, b):
+ if b - a <= DIGLIM:
+ return int(s[a:b])
+ mid = (a + b + 1) >> 1
+ return (inner(mid, b)
+ + ((inner(a, mid) * w5pow[b - mid])
+ << (b - mid)))
+
+ w5pow = compute_powers(len(s), 5, DIGLIM)
+ return inner(0, len(s))
+
+
+# Asymptotically faster version, using the C decimal module. See
+# comments at the end of the file. This uses decimal arithmetic to
+# convert from base 10 to base 256. The latter is just a string of
+# bytes, which CPython can convert very efficiently to a Python int.
+
+# log of 10 to base 256 with best-possible 53-bit precision. Obtained
+# via:
+# from mpmath import mp
+# mp.prec = 1000
+# print(float(mp.log(10, 256)).hex())
+_LOG_10_BASE_256 = float.fromhex('0x1.a934f0979a371p-2') # about 0.415
+
+# _spread is for internal testing. It maps a key to the number of times
+# that condition obtained in _dec_str_to_int_inner:
+# key 0 - quotient guess was right
+# key 1 - quotient had to be boosted by 1, one time
+# key 999 - one adjustment wasn't enough, so fell back to divmod
+from collections import defaultdict
+_spread = defaultdict(int)
+del defaultdict
+
+def _dec_str_to_int_inner(s, *, GUARD=8):
+ # Yes, BYTELIM is "large". Large enough that CPython will usually
+ # use the Karatsuba _str_to_int_inner to convert the string. This
+ # allowed reducing the cutoff for calling _this_ function from 3.5M
+ # to 2M digits. We could almost certainly do even better by
+ # fine-tuning this and/or using a larger output base than 256.
+ BYTELIM = 100_000
+ D = decimal.Decimal
+ result = bytearray()
+ # See notes at end of file for discussion of GUARD.
+ assert GUARD > 0 # if 0, `decimal` can blow up - .prec 0 not allowed
+
+ def inner(n, w):
+ #assert n < D256 ** w # required, but too expensive to check
+ if w <= BYTELIM:
+ # XXX Stefan Pochmann discovered that, for 1024-bit ints,
+ # `int(Decimal)` took 2.5x longer than `int(str(Decimal))`.
+ # Worse, `int(Decimal) is still quadratic-time for much
+ # larger ints. So unless/until all that is repaired, the
+ # seemingly redundant `str(Decimal)` is crucial to speed.
+ result.extend(int(str(n)).to_bytes(w)) # big-endian default
+ return
+ w1 = w >> 1
+ w2 = w - w1
+ if 0:
+ # This is maximally clear, but "too slow". `decimal`
+ # division is asymptotically fast, but we have no way to
+ # tell it to reuse the high-precision reciprocal it computes
+ # for pow256[w2], so it has to recompute it over & over &
+ # over again :-(
+ hi, lo = divmod(n, pow256[w2][0])
+ else:
+ p256, recip = pow256[w2]
+ # The integer part will have a number of digits about equal
+ # to the difference between the log10s of `n` and `pow256`
+ # (which, since these are integers, is roughly approximated
+ # by `.adjusted()`). That's the working precision we need,
+ ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + GUARD
+ hi = +n * +recip # unary `+` chops back to ctx.prec digits
+ ctx.prec = decimal.MAX_PREC
+ hi = hi.to_integral_value() # lose the fractional digits
+ lo = n - hi * p256
+ # Because we've been uniformly rounding down, `hi` is a
+ # lower bound on the correct quotient.
+ assert lo >= 0
+ # Adjust quotient up if needed. It usually isn't. In random
+ # testing on inputs through 5 billion digit strings, the
+ # test triggered once in about 200 thousand tries.
+ count = 0
+ if lo >= p256:
+ count = 1
+ lo -= p256
+ hi += 1
+ if lo >= p256:
+ # Complete correction via an exact computation. I
+ # believe it's not possible to get here provided
+ # GUARD >= 3. It's tested by reducing GUARD below
+ # that.
+ count = 999
+ hi2, lo = divmod(lo, p256)
+ hi += hi2
+ _spread[count] += 1
+ # The assert should always succeed, but way too slow to keep
+ # enabled.
+ #assert hi, lo == divmod(n, pow256[w2][0])
+ inner(hi, w1)
+ del hi # at top levels, can free a lot of RAM "early"
+ inner(lo, w2)
+
+ # How many base 256 digits are needed?. Mathematically, exactly
+ # floor(log256(int(s))) + 1. There is no cheap way to compute this.
+ # But we can get an upper bound, and that's necessary for our error
+ # analysis to make sense. int(s) < 10**len(s), so the log needed is
+ # < log256(10**len(s)) = len(s) * log256(10). However, using
+ # finite-precision floating point for this, it's possible that the
+ # computed value is a little less than the true value. If the true
+ # value is at - or a little higher than - an integer, we can get an
+ # off-by-1 error too low. So we add 2 instead of 1 if chopping lost
+ # a fraction > 0.9.
+
+ # The "WASI" test platform can complain about `len(s)` if it's too
+ # large to fit in its idea of "an index-sized integer".
+ lenS = s.__len__()
+ log_ub = lenS * _LOG_10_BASE_256
+ log_ub_as_int = int(log_ub)
+ w = log_ub_as_int + 1 + (log_ub - log_ub_as_int > 0.9)
+ # And what if we've plain exhausted the limits of HW floats? We
+ # could compute the log to any desired precision using `decimal`,
+ # but it's not plausible that anyone will pass a string requiring
+ # trillions of bytes (unless they're just trying to "break things").
+ if w.bit_length() >= 46:
+ # "Only" had < 53 - 46 = 7 bits to spare in IEEE-754 double.
+ raise ValueError(f"cannot convert string of len {lenS} to int")
+ with decimal.localcontext(_unbounded_dec_context) as ctx:
+ D256 = D(256)
+ pow256 = compute_powers(w, D256, BYTELIM, need_hi=True)
+ rpow256 = compute_powers(w, 1 / D256, BYTELIM, need_hi=True)
+ # We're going to do inexact, chopped arithmetic, multiplying by
+ # an approximation to the reciprocal of 256**i. We chop to get a
+ # lower bound on the true integer quotient. Our approximation is
+ # a lower bound, the multiplication is chopped too, and
+ # to_integral_value() is also chopped.
+ ctx.traps[decimal.Inexact] = 0
+ ctx.rounding = decimal.ROUND_DOWN
+ for k, v in pow256.items():
+ # No need to save much more precision in the reciprocal than
+ # the power of 256 has, plus some guard digits to absorb
+ # most relevant rounding errors. This is highly significant:
+ # 1/2**i has the same number of significant decimal digits
+ # as 5**i, generally over twice the number in 2**i,
+ ctx.prec = v.adjusted() + GUARD + 1
+ # The unary "+" chops the reciprocal back to that precision.
+ pow256[k] = v, +rpow256[k]
+ del rpow256 # exact reciprocals no longer needed
+ ctx.prec = decimal.MAX_PREC
+ inner(D(s), w)
+ return int.from_bytes(result)
+
+def int_from_string(s):
+ """Asymptotically fast version of PyLong_FromString(), conversion
+ of a string of decimal digits into an 'int'."""
+ # PyLong_FromString() has already removed leading +/-, checked for invalid
+ # use of underscore characters, checked that string consists of only digits
+ # and underscores, and stripped leading whitespace. The input can still
+ # contain underscores and have trailing whitespace.
+ s = s.rstrip().replace('_', '')
+ func = _str_to_int_inner
+ if len(s) >= 2_000_000 and _decimal is not None:
+ func = _dec_str_to_int_inner
+ return func(s)
+
+def str_to_int(s):
+ """Asymptotically fast version of decimal string to 'int' conversion."""
+ # FIXME: this doesn't support the full syntax that int() supports.
+ m = re.match(r'\s*([+-]?)([0-9_]+)\s*', s)
+ if not m:
+ raise ValueError('invalid literal for int() with base 10')
+ v = int_from_string(m.group(2))
+ if m.group(1) == '-':
+ v = -v
+ return v
+
+
+# Fast integer division, based on code from Mark Dickinson, fast_div.py
+# GH-47701. Additional refinements and optimizations by Bjorn Martinsson. The
+# algorithm is due to Burnikel and Ziegler, in their paper "Fast Recursive
+# Division".
+
+_DIV_LIMIT = 4000
+
+
+def _div2n1n(a, b, n):
+ """Divide a 2n-bit nonnegative integer a by an n-bit positive integer
+ b, using a recursive divide-and-conquer algorithm.
+
+ Inputs:
+ n is a positive integer
+ b is a positive integer with exactly n bits
+ a is a nonnegative integer such that a < 2**n * b
+
+ Output:
+ (q, r) such that a = b*q+r and 0 <= r < b.
+
+ """
+ if a.bit_length() - n <= _DIV_LIMIT:
+ return divmod(a, b)
+ pad = n & 1
+ if pad:
+ a <<= 1
+ b <<= 1
+ n += 1
+ half_n = n >> 1
+ mask = (1 << half_n) - 1
+ b1, b2 = b >> half_n, b & mask
+ q1, r = _div3n2n(a >> n, (a >> half_n) & mask, b, b1, b2, half_n)
+ q2, r = _div3n2n(r, a & mask, b, b1, b2, half_n)
+ if pad:
+ r >>= 1
+ return q1 << half_n | q2, r
+
+
+def _div3n2n(a12, a3, b, b1, b2, n):
+ """Helper function for _div2n1n; not intended to be called directly."""
+ if a12 >> n == b1:
+ q, r = (1 << n) - 1, a12 - (b1 << n) + b1
+ else:
+ q, r = _div2n1n(a12, b1, n)
+ r = (r << n | a3) - q * b2
+ while r < 0:
+ q -= 1
+ r += b
+ return q, r
+
+
+def _int2digits(a, n):
+ """Decompose non-negative int a into base 2**n
+
+ Input:
+ a is a non-negative integer
+
+ Output:
+ List of the digits of a in base 2**n in little-endian order,
+ meaning the most significant digit is last. The most
+ significant digit is guaranteed to be non-zero.
+ If a is 0 then the output is an empty list.
+
+ """
+ a_digits = [0] * ((a.bit_length() + n - 1) // n)
+
+ def inner(x, L, R):
+ if L + 1 == R:
+ a_digits[L] = x
+ return
+ mid = (L + R) >> 1
+ shift = (mid - L) * n
+ upper = x >> shift
+ lower = x ^ (upper << shift)
+ inner(lower, L, mid)
+ inner(upper, mid, R)
+
+ if a:
+ inner(a, 0, len(a_digits))
+ return a_digits
+
+
+def _digits2int(digits, n):
+ """Combine base-2**n digits into an int. This function is the
+ inverse of `_int2digits`. For more details, see _int2digits.
+ """
+
+ def inner(L, R):
+ if L + 1 == R:
+ return digits[L]
+ mid = (L + R) >> 1
+ shift = (mid - L) * n
+ return (inner(mid, R) << shift) + inner(L, mid)
+
+ return inner(0, len(digits)) if digits else 0
+
+
+def _divmod_pos(a, b):
+ """Divide a non-negative integer a by a positive integer b, giving
+ quotient and remainder."""
+ # Use grade-school algorithm in base 2**n, n = nbits(b)
+ n = b.bit_length()
+ a_digits = _int2digits(a, n)
+
+ r = 0
+ q_digits = []
+ for a_digit in reversed(a_digits):
+ q_digit, r = _div2n1n((r << n) + a_digit, b, n)
+ q_digits.append(q_digit)
+ q_digits.reverse()
+ q = _digits2int(q_digits, n)
+ return q, r
+
+
+def int_divmod(a, b):
+ """Asymptotically fast replacement for divmod, for 'int'.
+ Its time complexity is O(n**1.58), where n = #bits(a) + #bits(b).
+ """
+ if b == 0:
+ raise ZeroDivisionError('division by zero')
+ elif b < 0:
+ q, r = int_divmod(-a, -b)
+ return q, -r
+ elif a < 0:
+ q, r = int_divmod(~a, b)
+ return ~q, b + ~r
+ else:
+ return _divmod_pos(a, b)
+
+
+# Notes on _dec_str_to_int_inner:
+#
+# Stefan Pochmann worked up a str->int function that used the decimal
+# module to, in effect, convert from base 10 to base 256. This is
+# "unnatural", in that it requires multiplying and dividing by large
+# powers of 2, which `decimal` isn't naturally suited to. But
+# `decimal`'s `*` and `/` are asymptotically superior to CPython's, so
+# at _some_ point it could be expected to win.
+#
+# Alas, the crossover point was too high to be of much real interest. I
+# (Tim) then worked on ways to replace its division with multiplication
+# by a cached reciprocal approximation instead, fixing up errors
+# afterwards. This reduced the crossover point significantly.
+#
+# I revisited the code, and found ways to improve and simplify it. The
+# crossover point is at about 3.4 million digits now.
+#
+# About .adjusted()
+# -----------------
+# Restrict to Decimal values x > 0. We don't use negative numbers in the
+# code, and I don't want to have to keep typing, e.g., "absolute value".
+#
+# For convenience, I'll use `x.a` to mean `x.adjusted()`. x.a doesn't
+# look at the digits of x, but instead returns an integer giving x's
+# order of magnitude. These are equivalent:
+#
+# - x.a is the power-of-10 exponent of x's most significant digit.
+# - x.a = the infinitely precise floor(log10(x))
+# - x can be written in this form, where f is a real with 1 <= f < 10:
+# x = f * 10**x.a
+#
+# Observation: if x is an integer, len(str(x)) = x.a + 1.
+#
+# Lemma 1: (x * y).a = x.a + y.a, or one larger
+#
+# Proof: Write x = f * 10**x.a and y = g * 10**y.a, where f and g are in
+# [1, 10). Then x*y = f*g * 10**(x.a + y.a), where 1 <= f*g < 100. If
+# f*g < 10, (x*y).a is x.a+y.a. Else divide f*g by 10 to bring it back
+# into [1, 10], and add 1 to the exponent to compensate. Then (x*y).a is
+# x.a+y.a+1.
+#
+# Lemma 2: ceiling(log10(x/y)) <= x.a - y.a + 1
+#
+# Proof: Express x and y as in Lemma 1. Then x/y = f/g * 10**(x.a -
+# y.a), where 1/10 < f/g < 10. If 1 <= f/g, (x/y).a is x.a-y.a. Else
+# multiply f/g by 10 to bring it back into [1, 10], and subtract 1 from
+# the exponent to compensate. Then (x/y).a is x.a-y.a-1. So the largest
+# (x/y).a can be is x.a-y.a. Since that's the floor of log10(x/y), the
+# ceiling is at most 1 larger (with equality iff f/g = 1 exactly).
+#
+# GUARD digits
+# ------------
+# We only want the integer part of divisions, so don't need to build
+# the full multiplication tree. But using _just_ the number of
+# digits expected in the integer part ignores too much. What's left
+# out can have a very significant effect on the quotient. So we use
+# GUARD additional digits.
+#
+# The default 8 is more than enough so no more than 1 correction step
+# was ever needed for all inputs tried through 2.5 billion digits. In
+# fact, I believe 3 guard digits are always enough - but the proof is
+# very involved, so better safe than sorry.
+#
+# Short course:
+#
+# If prec is the decimal precision in effect, and we're rounding down,
+# the result of an operation is exactly equal to the infinitely precise
+# result times 1-e for some real e with 0 <= e < 10**(1-prec). In
+#
+# ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + GUARD
+# hi = +n * +recip # unary `+` chops to ctx.prec digits
+#
+# we have 3 visible chopped operations, but there's also a 4th:
+# precomputing a truncated `recip` as part of setup.
+#
+# So the computed product is exactly equal to the true product times
+# (1-e1)*(1-e2)*(1-e3)*(1-e4); since the e's are all very small, an
+# excellent approximation to the second factor is 1-(e1+e2+e3+e4) (the
+# 2nd and higher order terms in the expanded product are too tiny to
+# matter). If they're all as large as possible, that's
+#
+# 1 - 4*10**(1-prec). This, BTW, is all bog-standard FP error analysis.
+#
+# That implies the computed product is within 1 of the true product
+# provided prec >= log10(true_product) + 1.602.
+#
+# Here are telegraphic details, rephrasing the initial condition in
+# equivalent ways, step by step:
+#
+# prod - prod * (1 - 4*10**(1-prec)) <= 1
+# prod - prod + prod * 4*10**(1-prec) <= 1
+# prod * 4*10**(1-prec) <= 1
+# 10**(log10(prod)) * 4*10**(1-prec) <= 1
+# 4*10**(1-prec+log10(prod)) <= 1
+# 10**(1-prec+log10(prod)) <= 1/4
+# 1-prec+log10(prod) <= log10(1/4) = -0.602
+# -prec <= -1.602 - log10(prod)
+# prec >= log10(prod) + 1.602
+#
+# The true product is the same as the true ratio n/p256. By Lemma 2
+# above, n.a - p256.a + 1 is an upper bound on the ceiling of
+# log10(prod). Then 2 is the ceiling of 1.602, so n.a - p256.a + 3 is an
+# upper bound on the right hand side of the inequality. Any prec >= that
+# will work.
+#
+# But since this is just a sketch of a proof ;-), the code uses the
+# empirically tested 8 instead of 3. 5 digits more or less makes no
+# practical difference to speed - these ints are huge. And while
+# increasing GUARD above 3 may not be necessary, every increase cuts the
+# percentage of cases that need a correction at all.
+#
+# On Computing Reciprocals
+# ------------------------
+# In general, the exact reciprocals we compute have over twice as many
+# significant digits as needed. 1/256**i has the same number of
+# significant decimal digits as 5**i. It's a significant waste of RAM
+# to store all those unneeded digits.
+#
+# So we cut exact reciprocals back to the least precision that can
+# be needed so that the error analysis above is valid.
+#
+# [Note: turns out it's very significantly faster to do it this way than
+# to compute 1 / 256**i directly to the desired precision, because the
+# power method doesn't require division. It's also faster than computing
+# (1/256)**i directly to the desired precision - no material division
+# there, but `compute_powers()` is much smarter about _how_ to compute
+# all the powers needed than repeated applications of `**` - that
+# function invokes `**` for at most the few smallest powers needed.]
+#
+# The hard part is that chopping back to a shorter width occurs
+# _outside_ of `inner`. We can't know then what `prec` `inner()` will
+# need. We have to pick, for each value of `w2`, the largest possible
+# value `prec` can become when `inner()` is working on `w2`.
+#
+# This is the `prec` inner() uses:
+# max(n.a - p256.a, 0) + GUARD
+# and what setup uses (renaming its `v` to `p256` - same thing):
+# p256.a + GUARD + 1
+#
+# We need that the second is always at least as large as the first,
+# which is the same as requiring
+#
+# n.a - 2 * p256.a <= 1
+#
+# What's the largest n can be? n < 256**w = 256**(w2 + (w - w2)). The
+# worst case in this context is when w is even, and then w = 2*w2, so
+# n < 256**(2*w2) = (256**w2)**2 = p256**2. By Lemma 1, then, n.a
+# is at most p256.a + p256.a + 1.
+#
+# So the most n.a - 2 * p256.a can be is
+# p256.a + p256.a + 1 - 2 * p256.a = 1. QED
+#
+# Note: an earlier version of the code split on floor(e/2) instead of on
+# the ceiling. The worst case then is odd `w`, and a more involved proof
+# was needed to show that adding 4 (instead of 1) may be necessary.
+# Basically because, in that case, n may be up to 256 times larger than
+# p256**2. Curiously enough, by splitting on the ceiling instead,
+# nothing in any proof here actually depends on the output base (256).
+
+# Enable for brute-force testing of compute_powers(). This takes about a
+# minute, because it tries millions of cases.
+if 0:
+ def consumer(w, limit, need_hi):
+ seen = set()
+ need = set()
+ def inner(w):
+ if w <= limit:
+ return
+ if w in seen:
+ return
+ seen.add(w)
+ lo = w >> 1
+ hi = w - lo
+ need.add(hi if need_hi else lo)
+ inner(lo)
+ inner(hi)
+ inner(w)
+ exp = compute_powers(w, 1, limit, need_hi=need_hi)
+ assert exp.keys() == need
+
+ from itertools import chain
+ for need_hi in (False, True):
+ for limit in (0, 1, 10, 100, 1_000, 10_000, 100_000):
+ for w in chain(range(1, 100_000),
+ (10**i for i in range(5, 30))):
+ consumer(w, limit, need_hi)
diff --git a/python/python3_13/examples/_sitebuiltins.py b/python/python3_14/examples/_sitebuiltins.py
similarity index 100%
rename from python/python3_13/examples/_sitebuiltins.py
rename to python/python3_14/examples/_sitebuiltins.py
diff --git a/python/python3_13/examples/_strptime.py b/python/python3_14/examples/_strptime.py
similarity index 75%
rename from python/python3_13/examples/_strptime.py
rename to python/python3_14/examples/_strptime.py
index 4c68a6a88e..fc7e369c3d 100644
--- a/python/python3_13/examples/_strptime.py
+++ b/python/python3_14/examples/_strptime.py
@@ -14,6 +14,7 @@
import time
import locale
import calendar
+import re
from re import compile as re_compile
from re import sub as re_sub
from re import IGNORECASE
@@ -41,6 +42,29 @@ def _findall(haystack, needle):
yield i
i += len(needle)
+def _fixmonths(months):
+ yield from months
+ # The lower case of 'İ' ('\u0130') is 'i\u0307'.
+ # The re module only supports 1-to-1 character matching in
+ # case-insensitive mode.
+ for s in months:
+ if 'i\u0307' in s:
+ yield s.replace('i\u0307', '\u0130')
+
+lzh_TW_alt_digits = (
+ # 〇:一:二:三:四:五:六:七:八:九
+ '\u3007', '\u4e00', '\u4e8c', '\u4e09', '\u56db',
+ '\u4e94', '\u516d', '\u4e03', '\u516b', '\u4e5d',
+ # 十:十一:十二:十三:十四:十五:十六:十七:十八:十九
+ '\u5341', '\u5341\u4e00', '\u5341\u4e8c', '\u5341\u4e09', '\u5341\u56db',
+ '\u5341\u4e94', '\u5341\u516d', '\u5341\u4e03', '\u5341\u516b', '\u5341\u4e5d',
+ # 廿:廿一:廿二:廿三:廿四:廿五:廿六:廿七:廿八:廿九
+ '\u5eff', '\u5eff\u4e00', '\u5eff\u4e8c', '\u5eff\u4e09', '\u5eff\u56db',
+ '\u5eff\u4e94', '\u5eff\u516d', '\u5eff\u4e03', '\u5eff\u516b', '\u5eff\u4e5d',
+ # 卅:卅一
+ '\u5345', '\u5345\u4e00')
+
+
class LocaleTime(object):
"""Stores and handles locale-specific information related to time.
@@ -84,6 +108,7 @@ def __init__(self):
self.__calc_weekday()
self.__calc_month()
self.__calc_am_pm()
+ self.__calc_alt_digits()
self.__calc_timezone()
self.__calc_date_time()
if _getlang() != self.lang:
@@ -119,9 +144,43 @@ def __calc_am_pm(self):
am_pm.append(time.strftime("%p", time_tuple).lower().strip())
self.am_pm = am_pm
+ def __calc_alt_digits(self):
+ # Set self.LC_alt_digits by using time.strftime().
+
+ # The magic data should contain all decimal digits.
+ time_tuple = time.struct_time((1998, 1, 27, 10, 43, 56, 1, 27, 0))
+ s = time.strftime("%x%X", time_tuple)
+ if s.isascii():
+ # Fast path -- all digits are ASCII.
+ self.LC_alt_digits = ()
+ return
+
+ digits = ''.join(sorted(set(re.findall(r'\d', s))))
+ if len(digits) == 10 and ord(digits[-1]) == ord(digits[0]) + 9:
+ # All 10 decimal digits from the same set.
+ if digits.isascii():
+ # All digits are ASCII.
+ self.LC_alt_digits = ()
+ return
+
+ self.LC_alt_digits = [a + b for a in digits for b in digits]
+ # Test whether the numbers contain leading zero.
+ time_tuple2 = time.struct_time((2000, 1, 1, 1, 1, 1, 5, 1, 0))
+ if self.LC_alt_digits[1] not in time.strftime("%x %X", time_tuple2):
+ self.LC_alt_digits[:10] = digits
+ return
+
+ # Either non-Gregorian calendar or non-decimal numbers.
+ if {'\u4e00', '\u4e03', '\u4e5d', '\u5341', '\u5eff'}.issubset(s):
+ # lzh_TW
+ self.LC_alt_digits = lzh_TW_alt_digits
+ return
+
+ self.LC_alt_digits = None
+
def __calc_date_time(self):
- # Set self.date_time, self.date, & self.time by using
- # time.strftime().
+ # Set self.LC_date_time, self.LC_date, self.LC_time and
+ # self.LC_time_ampm by using time.strftime().
# Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of
# overloaded numbers is minimized. The order in which searches for
@@ -129,26 +188,32 @@ def __calc_date_time(self):
# possible ambiguity for what something represents.
time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
- replacement_pairs = [
+ replacement_pairs = []
+
+ # Non-ASCII digits
+ if self.LC_alt_digits or self.LC_alt_digits is None:
+ for n, d in [(19, '%OC'), (99, '%Oy'), (22, '%OH'),
+ (44, '%OM'), (55, '%OS'), (17, '%Od'),
+ (3, '%Om'), (2, '%Ow'), (10, '%OI')]:
+ if self.LC_alt_digits is None:
+ s = chr(0x660 + n // 10) + chr(0x660 + n % 10)
+ replacement_pairs.append((s, d))
+ if n < 10:
+ replacement_pairs.append((s[1], d))
+ elif len(self.LC_alt_digits) > n:
+ replacement_pairs.append((self.LC_alt_digits[n], d))
+ else:
+ replacement_pairs.append((time.strftime(d, time_tuple), d))
+ replacement_pairs += [
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
('44', '%M'), ('55', '%S'), ('76', '%j'),
('17', '%d'), ('03', '%m'), ('3', '%m'),
# '3' needed for when no leading zero.
('2', '%w'), ('10', '%I'),
- # Non-ASCII digits
- ('\u0661\u0669\u0669\u0669', '%Y'),
- ('\u0669\u0669', '%Oy'),
- ('\u0662\u0662', '%OH'),
- ('\u0664\u0664', '%OM'),
- ('\u0665\u0665', '%OS'),
- ('\u0661\u0667', '%Od'),
- ('\u0660\u0663', '%Om'),
- ('\u0663', '%Om'),
- ('\u0662', '%Ow'),
- ('\u0661\u0660', '%OI'),
]
+
date_time = []
- for directive in ('%c', '%x', '%X'):
+ for directive in ('%c', '%x', '%X', '%r'):
current_format = time.strftime(directive, time_tuple).lower()
current_format = current_format.replace('%', '%%')
# The month and the day of the week formats are treated specially
@@ -172,9 +237,10 @@ def __calc_date_time(self):
if tz:
current_format = current_format.replace(tz, "%Z")
# Transform all non-ASCII digits to digits in range U+0660 to U+0669.
- current_format = re_sub(r'\d(?3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
'f': r"(?P[0-9]{1,6})",
- 'H': r"(?P2[0-3]|[0-1]\d|\d)",
+ 'H': r"(?P2[0-3]|[0-1]\d|\d| \d)",
+ 'k': r"(?P2[0-3]|[0-1]\d|\d| \d)",
'I': r"(?P1[0-2]|0[1-9]|[1-9]| [1-9])",
+ 'l': r"(?P1[0-2]|0[1-9]|[1-9]| [1-9])",
'G': r"(?P\d\d\d\d)",
'j': r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
'm': r"(?P1[0-2]|0[1-9]|[1-9])",
@@ -301,29 +370,60 @@ def __init__(self, locale_time=None):
'V': r"(?P5[0-3]|0[1-9]|[1-4]\d|\d)",
# W is set below by using 'U'
'y': r"(?P\d\d)",
- #XXX: Does 'Y' need to worry about having less or more than
- # 4 digits?
'Y': r"(?P\d\d\d\d)",
'z': r"(?P[+-]\d\d:?[0-5]\d(:?[0-5]\d(\.\d{1,6})?)?|(?-i:Z))",
'A': self.__seqToRE(self.locale_time.f_weekday, 'A'),
'a': self.__seqToRE(self.locale_time.a_weekday, 'a'),
- 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'),
- 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'),
+ 'B': self.__seqToRE(_fixmonths(self.locale_time.f_month[1:]), 'B'),
+ 'b': self.__seqToRE(_fixmonths(self.locale_time.a_month[1:]), 'b'),
'p': self.__seqToRE(self.locale_time.am_pm, 'p'),
'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone
for tz in tz_names),
'Z'),
'%': '%'}
- for d in 'dmyHIMS':
- mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
- mapping['Ow'] = r'(?P\d)'
+ if self.locale_time.LC_alt_digits is None:
+ for d in 'dmyCHIMS':
+ mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
+ mapping['Ow'] = r'(?P\d)'
+ else:
+ mapping.update({
+ 'Od': self.__seqToRE(self.locale_time.LC_alt_digits[1:32], 'd',
+ '3[0-1]|[1-2][0-9]|0[1-9]|[1-9]'),
+ 'Om': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'm',
+ '1[0-2]|0[1-9]|[1-9]'),
+ 'Ow': self.__seqToRE(self.locale_time.LC_alt_digits[:7], 'w',
+ '[0-6]'),
+ 'Oy': self.__seqToRE(self.locale_time.LC_alt_digits, 'y',
+ '[0-9][0-9]'),
+ 'OC': self.__seqToRE(self.locale_time.LC_alt_digits, 'C',
+ '[0-9][0-9]'),
+ 'OH': self.__seqToRE(self.locale_time.LC_alt_digits[:24], 'H',
+ '2[0-3]|[0-1][0-9]|[0-9]'),
+ 'OI': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'I',
+ '1[0-2]|0[1-9]|[1-9]'),
+ 'OM': self.__seqToRE(self.locale_time.LC_alt_digits[:60], 'M',
+ '[0-5][0-9]|[0-9]'),
+ 'OS': self.__seqToRE(self.locale_time.LC_alt_digits[:62], 'S',
+ '6[0-1]|[0-5][0-9]|[0-9]'),
+ })
+ mapping.update({
+ 'e': mapping['d'],
+ 'Oe': mapping['Od'],
+ 'P': mapping['p'],
+ 'Op': mapping['p'],
+ 'W': mapping['U'].replace('U', 'W'),
+ })
mapping['W'] = mapping['U'].replace('U', 'W')
+
base.__init__(mapping)
+ base.__setitem__('T', self.pattern('%H:%M:%S'))
+ base.__setitem__('R', self.pattern('%H:%M'))
+ base.__setitem__('r', self.pattern(self.locale_time.LC_time_ampm))
base.__setitem__('X', self.pattern(self.locale_time.LC_time))
base.__setitem__('x', self.pattern(self.locale_time.LC_date))
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
- def __seqToRE(self, to_convert, directive):
+ def __seqToRE(self, to_convert, directive, altregex=None):
"""Convert a list to a regex string for matching a directive.
Want possible matching values to be from longest to shortest. This
@@ -339,8 +439,9 @@ def __seqToRE(self, to_convert, directive):
else:
return ''
regex = '|'.join(re_escape(stuff) for stuff in to_convert)
- regex = '(?P<%s>%s' % (directive, regex)
- return '%s)' % regex
+ if altregex is not None:
+ regex += '|' + altregex
+ return '(?P<%s>%s)' % (directive, regex)
def pattern(self, format):
"""Return regex pattern for the format string.
@@ -367,11 +468,11 @@ def repl(m):
nonlocal day_of_month_in_format
day_of_month_in_format = True
return self[format_char]
- format = re_sub(r'%(O?.)', repl, format)
+ format = re_sub(r'%[-_0^#]*[0-9]*([OE]?\\?.?)', repl, format)
if day_of_month_in_format and not year_in_format:
import warnings
warnings.warn("""\
-Parsing dates involving a day of month without a year specified is ambiguious
+Parsing dates involving a day of month without a year specified is ambiguous
and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
@@ -441,14 +542,13 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
# \\, in which case it was a stray % but with a space after it
except KeyError as err:
bad_directive = err.args[0]
- if bad_directive == "\\":
- bad_directive = "%"
del err
+ bad_directive = bad_directive.replace('\\s', '')
+ if not bad_directive:
+ raise ValueError("stray %% in format '%s'" % format) from None
+ bad_directive = bad_directive.replace('\\', '', 1)
raise ValueError("'%s' is a bad directive in format '%s'" %
(bad_directive, format)) from None
- # IndexError only occurs when the format string is "%"
- except IndexError:
- raise ValueError("stray %% in format '%s'" % format) from None
_regex_cache[format] = format_regex
found = format_regex.match(data_string)
if not found:
@@ -470,6 +570,15 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
# values
weekday = julian = None
found_dict = found.groupdict()
+ if locale_time.LC_alt_digits:
+ def parse_int(s):
+ try:
+ return locale_time.LC_alt_digits.index(s)
+ except ValueError:
+ return int(s)
+ else:
+ parse_int = int
+
for group_key in found_dict.keys():
# Directives not explicitly handled below:
# c, x, X
@@ -477,30 +586,34 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
# U, W
# worthless without day of the week
if group_key == 'y':
- year = int(found_dict['y'])
- # Open Group specification for strptime() states that a %y
- #value in the range of [00, 68] is in the century 2000, while
- #[69,99] is in the century 1900
- if year <= 68:
- year += 2000
+ year = parse_int(found_dict['y'])
+ if 'C' in found_dict:
+ century = parse_int(found_dict['C'])
+ year += century * 100
else:
- year += 1900
+ # Open Group specification for strptime() states that a %y
+ #value in the range of [00, 68] is in the century 2000, while
+ #[69,99] is in the century 1900
+ if year <= 68:
+ year += 2000
+ else:
+ year += 1900
elif group_key == 'Y':
year = int(found_dict['Y'])
elif group_key == 'G':
iso_year = int(found_dict['G'])
elif group_key == 'm':
- month = int(found_dict['m'])
+ month = parse_int(found_dict['m'])
elif group_key == 'B':
month = locale_time.f_month.index(found_dict['B'].lower())
elif group_key == 'b':
month = locale_time.a_month.index(found_dict['b'].lower())
elif group_key == 'd':
- day = int(found_dict['d'])
+ day = parse_int(found_dict['d'])
elif group_key == 'H':
- hour = int(found_dict['H'])
+ hour = parse_int(found_dict['H'])
elif group_key == 'I':
- hour = int(found_dict['I'])
+ hour = parse_int(found_dict['I'])
ampm = found_dict.get('p', '').lower()
# If there was no AM/PM indicator, we'll treat this like AM
if ampm in ('', locale_time.am_pm[0]):
@@ -516,9 +629,9 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
if hour != 12:
hour += 12
elif group_key == 'M':
- minute = int(found_dict['M'])
+ minute = parse_int(found_dict['M'])
elif group_key == 'S':
- second = int(found_dict['S'])
+ second = parse_int(found_dict['S'])
elif group_key == 'f':
s = found_dict['f']
# Pad to always return microseconds.
@@ -670,18 +783,40 @@ def _strptime_time(data_string, format="%a %b %d %H:%M:%S %Y"):
tt = _strptime(data_string, format)[0]
return time.struct_time(tt[:time._STRUCT_TM_ITEMS])
-def _strptime_datetime(cls, data_string, format="%a %b %d %H:%M:%S %Y"):
- """Return a class cls instance based on the input string and the
+def _strptime_datetime_date(cls, data_string, format="%a %b %d %Y"):
+ """Return a date instance based on the input string and the
+ format string."""
+ tt, _, _ = _strptime(data_string, format)
+ args = tt[:3]
+ return cls(*args)
+
+def _parse_tz(tzname, gmtoff, gmtoff_fraction):
+ tzdelta = datetime_timedelta(seconds=gmtoff, microseconds=gmtoff_fraction)
+ if tzname:
+ return datetime_timezone(tzdelta, tzname)
+ else:
+ return datetime_timezone(tzdelta)
+
+def _strptime_datetime_time(cls, data_string, format="%H:%M:%S"):
+ """Return a time instance based on the input string and the
format string."""
tt, fraction, gmtoff_fraction = _strptime(data_string, format)
tzname, gmtoff = tt[-2:]
- args = tt[:6] + (fraction,)
- if gmtoff is not None:
- tzdelta = datetime_timedelta(seconds=gmtoff, microseconds=gmtoff_fraction)
- if tzname:
- tz = datetime_timezone(tzdelta, tzname)
- else:
- tz = datetime_timezone(tzdelta)
- args += (tz,)
+ args = tt[3:6] + (fraction,)
+ if gmtoff is None:
+ return cls(*args)
+ else:
+ tz = _parse_tz(tzname, gmtoff, gmtoff_fraction)
+ return cls(*args, tz)
- return cls(*args)
+def _strptime_datetime_datetime(cls, data_string, format="%a %b %d %H:%M:%S %Y"):
+ """Return a datetime instance based on the input string and the
+ format string."""
+ tt, fraction, gmtoff_fraction = _strptime(data_string, format)
+ tzname, gmtoff = tt[-2:]
+ args = tt[:6] + (fraction,)
+ if gmtoff is None:
+ return cls(*args)
+ else:
+ tz = _parse_tz(tzname, gmtoff, gmtoff_fraction)
+ return cls(*args, tz)
diff --git a/python/python3_13/pom.xml b/python/python3_14/pom.xml
similarity index 95%
rename from python/python3_13/pom.xml
rename to python/python3_14/pom.xml
index 82ae939c27..bf0e52aac8 100644
--- a/python/python3_13/pom.xml
+++ b/python/python3_14/pom.xml
@@ -1,8 +1,8 @@
4.0.0
- python3-13
+ python3-14
jar
- Python3.13 grammar
+ Python3.14 grammar
org.antlr.grammars
pythonparent