From 4f9a84e9bf3df6c5e1ca1544b1b63fcae85f7f9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B8=D1=85=D0=B0=D0=B8=D0=BB=20=D0=9A=D0=BE=D1=80?= =?UTF-8?q?=D0=BC=D0=B0=D0=BD=D0=BE=D0=B2=D1=81=D0=BA=D0=B8=D0=B9?= Date: Thu, 13 Mar 2025 00:09:42 +0300 Subject: [PATCH 1/6] feat: added C++ port of TypeScript PythonParserBase for Python 3.13 --- python/python3_13/Cpp/PythonLexerBase.cpp | 754 ++++++++++++++++++++++ python/python3_13/Cpp/PythonLexerBase.h | 121 ++++ 2 files changed, 875 insertions(+) create mode 100644 python/python3_13/Cpp/PythonLexerBase.cpp create mode 100644 python/python3_13/Cpp/PythonLexerBase.h diff --git a/python/python3_13/Cpp/PythonLexerBase.cpp b/python/python3_13/Cpp/PythonLexerBase.cpp new file mode 100644 index 0000000000..e35d8ee58a --- /dev/null +++ b/python/python3_13/Cpp/PythonLexerBase.cpp @@ -0,0 +1,754 @@ +#include "PythonLexerBase.h" + +using namespace antlr4; + +// reading the input stream until a return EOF +std::unique_ptr PythonLexerBase::nextToken() { + this->checkNextToken(); + + std::unique_ptr next; + + if (!this->pendingTokens.empty()) + { + next = std::move(*this->pendingTokens.begin()); // add the queued token to the token stream + this->pendingTokens.erase(this->pendingTokens.begin()); + } + + return next; +} + +void PythonLexerBase::reset() { + this->init(); + Lexer::reset(); +} + +std::unique_ptr PythonLexerBase::cloneToken( + const std::unique_ptr &source, + size_t channel, + const std::string &text, + size_t type +) { + return this->_factory->create( + { this, this->_input }, + type, + text, + channel, + source->getStartIndex(), + source->getStopIndex(), + source->getLine(), + source->getCharPositionInLine() + ); +} + +std::unique_ptr PythonLexerBase::cloneToken(const std::unique_ptr &source, size_t channel) { + return this->cloneToken( + source, + channel, + source->getText(), + source->getType() + ); +} + +std::unique_ptr PythonLexerBase::cloneToken(const std::unique_ptr &source, const std::string &text) { + return this->cloneToken( + source, + source->getChannel(), + text, + source->getType() + ); +} + +std::unique_ptr PythonLexerBase::cloneToken(const std::unique_ptr& source) { + return this->_factory->create( + { this, this->_input }, + source->getType(), + source->getText(), + source->getChannel(), + source->getStartIndex(), + source->getStopIndex(), + source->getLine(), + source->getCharPositionInLine() + ); +} + +void PythonLexerBase::init() { + while (!this->indentLengthStack.empty()) { + this->indentLengthStack.pop(); + } + + this->pendingTokens.clear(); + this->previousPendingTokenType = 0; + this->lastPendingTokenTypeFromDefaultChannel = 0; + this->opened = 0; + this->paren_or_bracket_openedStack.clear(); + this->braceExpressionStack.clear(); + this->prevBraceExpression = ""; + this->curLexerMode = 0; + this->lexerModeStack.clear(); + this->wasSpaceIndentation = false; + this->wasTabIndentation = false; + this->wasIndentationMixedWithSpacesAndTabs = false; + this->curToken = nullptr; + this->ffgToken = nullptr; +} + +void PythonLexerBase::checkNextToken() { + if (this->previousPendingTokenType == Token::EOF) { + return; + } + + if (this->indentLengthStack.empty()) { // We're at the first token + this->insertENCODINGtoken(); + this->setCurrentAndFollowingTokens(); + this->handleStartOfInput(); + } else { + this->setCurrentAndFollowingTokens(); + } + + + switch(this->curToken->getType()) { + case PythonLexer::NEWLINE: + this->handleNEWLINEtoken(); + break; + case PythonLexer::LPAR: + case PythonLexer::LSQB: + case PythonLexer::LBRACE: + this->opened++; + this->addPendingToken(this->curToken); + break; + case PythonLexer::RPAR: + case PythonLexer::RSQB: + case PythonLexer::RBRACE: + this->opened--; + this->addPendingToken(this->curToken); + break; + case PythonLexer::FSTRING_MIDDLE: // does not affect the opened field + this->handleFSTRING_MIDDLEtokenWithDoubleBrace(); + this->addPendingToken(this->curToken); + break; + case PythonLexer::COLONEQUAL: + this->handleCOLONEQUALtokenInFString(); + break; + case PythonLexer::ERRORTOKEN: + this->reportLexerError(std::string("token recognition error at: '" + this->curToken->getText() + "'")); + this->addPendingToken(this->curToken); + break; + case PythonLexer::EOF: + this->handleEOFtoken(); + break; + default: + this->addPendingToken(this->curToken); + break; + } + + this->handleFORMAT_SPECIFICATION_MODE(); +} + +void PythonLexerBase::setCurrentAndFollowingTokens() { + if (this->ffgToken) { + this->curToken = this->cloneToken(this->ffgToken); + } else { + this->curToken = PythonLexer::nextToken(); + } + + this->checkCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)! + + if (this->curToken->getType() == PythonLexer::EOF) { + this->ffgToken = this->cloneToken(this->ffgToken); + } else { + this->ffgToken = PythonLexer::nextToken(); + } +} + +void PythonLexerBase::insertENCODINGtoken() { // https://peps.python.org/pep-0263/ + std::string lineBuilder = ""; + std::string encodingName = ""; + size_t lineCount = 0; + std::regex ws_commentPattern = std::regex("^[ \t\f]*(#.*)?$"); + auto charStream = this->_input; + size_t size = charStream->size(); + charStream->seek(0); + + for(size_t i = 0; i < size; i++) { + auto c = std::to_string(charStream->LA(i + 1)); + lineBuilder += c; + + if (c == "\n" || i == size - 1) { + auto line = std::regex_replace(lineBuilder, std::regex("\r|\n"), ""); + if (std::regex_match(line, ws_commentPattern)) { // https://peps.python.org/pep-0263/ + encodingName = this->getEncodingName(line); + + if (encodingName != "") { + break; // encoding found + } + } else { + break; // statement or backslash found (line is not empty, not whitespace(s), not comment) + } + + lineCount++; + + if (lineCount >= 2) { + break; // check only the first two lines + } + + lineBuilder = ""; + } + } + + if (encodingName == "") { + encodingName = "utf-8"; // default Python source code encoding + } + + std::unique_ptr encodingToken = this->_factory->create( + {this, this->_input}, + PythonLexer::ENCODING, + encodingName, + Token::HIDDEN_CHANNEL, + 0, + 0, + 0, + -1 + ); + + this->addPendingToken(encodingToken); +} + +std::string PythonLexerBase::getEncodingName(const std::string &commentText) { // https://peps.python.org/pep-0263/#defining-the-encoding + std::smatch m; + std::regex encodingCommentPattern("^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)"); + if(std::regex_match(commentText, m, encodingCommentPattern)) + { + return m[1]; + } + return ""; +} + +// initialize the indentLengthStack +// hide the leading NEWLINE token(s) +// if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel +// insert a leading INDENT token if necessary +void PythonLexerBase::handleStartOfInput() { + // initialize the stack with a default 0 indentation length + this->indentLengthStack.push(0); // this will never be popped off + + while (this->curToken->getType() != PythonLexer::EOF) { + if (this->curToken->getChannel() == Token::DEFAULT_CHANNEL) { + if (this->curToken->getType() == PythonLexer::NEWLINE) { + // all the NEWLINE tokens must be ignored before the first statement + this->hideAndAddPendingToken(this->curToken); + } else { // We're at the first statement + this->insertLeadingIndentToken(); + return; // continue the processing of the current token with checkNextToken() + } + } else { + this->addPendingToken(this->curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token + } + this->setCurrentAndFollowingTokens(); + } // continue the processing of the EOF token with checkNextToken() +} + +void PythonLexerBase::insertLeadingIndentToken() { + if (this->previousPendingTokenType == PythonLexer::WS) { + auto prevToken = std::move(*this->pendingTokens.rbegin()); // WS token + + if (this->getIndentationLength(prevToken->getText()) != 0) { // there is an "indentation" before the first statement + std::string errMsg = "first statement indented"; + + this->reportLexerError(errMsg); + + // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser + this->createAndAddPendingToken( + PythonLexer::INDENT, + Token::DEFAULT_CHANNEL, + PythonLexerBase::ERR_TXT + errMsg, + this->curToken + ); + } + } +} + +void PythonLexerBase::handleNEWLINEtoken() { + if (!this->lexerModeStack.empty()) { + this->addPendingToken(this->curToken); + } else if (this->opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token + this->hideAndAddPendingToken(this->curToken); + } else { + auto nlToken = this->cloneToken(this->curToken); // save the current NEWLINE token + bool isLookingAhead = this->ffgToken->getType() == PythonLexer::WS; + + if (isLookingAhead) { + this->setCurrentAndFollowingTokens(); // set the next two tokens + } + + switch (this->ffgToken->getType()) { + case PythonLexer::NEWLINE: // We're before a blank line + case PythonLexer::COMMENT: // We're before a comment + this->hideAndAddPendingToken(std::move(nlToken)); + if (isLookingAhead) { + this->addPendingToken(this->curToken); // WS token + } + break; + default: + this->addPendingToken(std::move(nlToken)); + if (isLookingAhead) { // We're on whitespace(s) followed by a statement + auto indentationLength = this->ffgToken->getType() == PythonLexer::EOF ? + 0 : this->getIndentationLength(this->curToken->getText()); + + if (indentationLength != PythonLexerBase::INVALID_LENGTH) { + this->addPendingToken(this->curToken); // WS token + this->insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s) + } else { + this->reportError("inconsistent use of tabs and spaces in indentation"); + } + } else { // We're at a newline followed by a statement (there is no whitespace before the statement) + this->insertIndentOrDedentToken(0); // may insert DEDENT token(s) + } + } + } +} + +void PythonLexerBase::insertIndentOrDedentToken(size_t indentLength) { + auto prevIndentLength = this->indentLengthStack.top(); + + if (indentLength > prevIndentLength) { + this->createAndAddPendingToken(PythonLexer::INDENT, Token::DEFAULT_CHANNEL, this->ffgToken); + this->indentLengthStack.push(indentLength); + } else { + while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream + this->indentLengthStack.pop(); + prevIndentLength = this->indentLengthStack.top(); + + if (indentLength <= prevIndentLength) { + this->createAndAddPendingToken(PythonLexer::DEDENT, Token::DEFAULT_CHANNEL, this->ffgToken); + } else { + this->reportError("inconsistent dedent"); + } + } + } +} + +void PythonLexerBase::checkCurToken() { + switch (this->curToken->getType()) { + case PythonLexer::FSTRING_START: + this->setLexerModeByFSTRING_STARTtoken(); + return; + case PythonLexer::FSTRING_MIDDLE: + this->handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); // affect the opened field + if (this->curToken->getType() == PythonLexer::FSTRING_MIDDLE) { + return; + } + break; + case PythonLexer::FSTRING_END: + this->popLexerMode(); + return; + default: + if (this->lexerModeStack.empty()) { + return; + } + } + + switch (this->curToken->getType()) { // the following tokens can only come from default mode (after an LBRACE in fstring) + case PythonLexer::NEWLINE: + // append the current brace expression with the current newline + this->appendToBraceExpression(this->curToken->getText()); + this->curToken = this->cloneToken(this->curToken, Token::HIDDEN_CHANNEL); + break; + case PythonLexer::LBRACE: + // the outermost brace expression cannot be a dictionary comprehension or a set comprehension + this->braceExpressionStack.push_back("{"); + this->paren_or_bracket_openedStack.push_back(0); + this->pushLexerMode(Lexer::DEFAULT_MODE); + break; + case PythonLexer::LPAR: + case PythonLexer::LSQB: + // append the current brace expression with a "(" or a "[" + this->appendToBraceExpression(this->curToken->getText()); + // https://peps.python.org/pep-0498/#lambdas-inside-expressions + this->incrementBraceStack(); + break; + case PythonLexer::RPAR: + case PythonLexer::RSQB: + // append the current brace expression with a ")" or a "]" + this->appendToBraceExpression(this->curToken->getText()); + this->decrementBraceStack(); + break; + case PythonLexer::COLON: + case PythonLexer::COLONEQUAL: + // append the current brace expression with a ":" or a ":=" + this->appendToBraceExpression(this->curToken->getText()); + this->setLexerModeByCOLONorCOLONEQUALtoken(); + break; + case PythonLexer::RBRACE: + this->setLexerModeAfterRBRACEtoken(); + break; + default: + // append the current brace expression with the current token text + this->appendToBraceExpression(this->curToken->getText()); + } +} + +void PythonLexerBase::appendToBraceExpression(const std::string &text) { + *this->braceExpressionStack.rbegin() += text; +} + +void PythonLexerBase::incrementBraceStack() { // increment the last element (peek() + 1) + (*this->paren_or_bracket_openedStack.rbegin())++; +} + +void PythonLexerBase::decrementBraceStack() { // decrement the last element (peek() - 1) + (*this->paren_or_bracket_openedStack.rbegin())--; +} + +void PythonLexerBase::setLexerModeAfterRBRACEtoken() { + switch (this->curLexerMode) { + case Lexer::DEFAULT_MODE: + this->popLexerMode(); + this->popByBRACE(); + break; + case PythonLexer::SQ1__FORMAT_SPECIFICATION_MODE: + case PythonLexer::SQ1R_FORMAT_SPECIFICATION_MODE: + case PythonLexer::DQ1__FORMAT_SPECIFICATION_MODE: + case PythonLexer::DQ1R_FORMAT_SPECIFICATION_MODE: + case PythonLexer::SQ3__FORMAT_SPECIFICATION_MODE: + case PythonLexer::SQ3R_FORMAT_SPECIFICATION_MODE: + case PythonLexer::DQ3__FORMAT_SPECIFICATION_MODE: + case PythonLexer::DQ3R_FORMAT_SPECIFICATION_MODE: + this->popLexerMode(); + this->popLexerMode(); + this->popByBRACE(); + break; + default: + this->reportLexerError("f-string: single '}' is not allowed"); + } +} + +void PythonLexerBase::setLexerModeByFSTRING_STARTtoken() { + std::string curTokenText = this->curToken->getText(); + auto text = curTokenText; + std::transform(text.cbegin(), text.cend(), text.begin(), [](auto ch) { return std::tolower(ch); }); + std::map modeMap = { + {"f'", PythonLexer::SQ1__FSTRING_MODE}, + {"rf'", PythonLexer::SQ1R_FSTRING_MODE}, + {"fr'", PythonLexer::SQ1R_FSTRING_MODE}, + {"f\"", PythonLexer::DQ1__FSTRING_MODE}, + {"rf\"", PythonLexer::DQ1R_FSTRING_MODE}, + {"fr\"", PythonLexer::DQ1R_FSTRING_MODE}, + {"f'''", PythonLexer::SQ3__FSTRING_MODE}, + {"rf'''", PythonLexer::SQ3R_FSTRING_MODE}, + {"fr'''", PythonLexer::SQ3R_FSTRING_MODE}, + {"f\"\"\"", PythonLexer::DQ3__FSTRING_MODE}, + {"rf\"\"\"", PythonLexer::DQ3R_FSTRING_MODE}, + {"fr\"\"\"", PythonLexer::DQ3R_FSTRING_MODE}, + }; + + if (modeMap.find(text) != modeMap.end()) { + this->pushLexerMode(modeMap[text]); + } +} + +void PythonLexerBase::setLexerModeByCOLONorCOLONEQUALtoken() { + if (*this->paren_or_bracket_openedStack.rbegin() == 0) { // stack peek == 0 + auto previousMode = *this->lexerModeStack.rbegin(); // stack peek + switch (previousMode) { // check the previous lexer mode (the current is DEFAULT_MODE) + case PythonLexer::SQ1__FSTRING_MODE: + case PythonLexer::SQ1__FORMAT_SPECIFICATION_MODE: + this->pushLexerMode(PythonLexer::SQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer::SQ1R_FSTRING_MODE: + case PythonLexer::SQ1R_FORMAT_SPECIFICATION_MODE: + this->pushLexerMode(PythonLexer::SQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer::DQ1__FSTRING_MODE: + case PythonLexer::DQ1__FORMAT_SPECIFICATION_MODE: + this->pushLexerMode(PythonLexer::DQ1__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer::DQ1R_FSTRING_MODE: + case PythonLexer::DQ1R_FORMAT_SPECIFICATION_MODE: + this->pushLexerMode(PythonLexer::DQ1R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer::SQ3__FSTRING_MODE: + case PythonLexer::SQ3__FORMAT_SPECIFICATION_MODE: + this->pushLexerMode(PythonLexer::SQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer::SQ3R_FSTRING_MODE: + case PythonLexer::SQ3R_FORMAT_SPECIFICATION_MODE: + this->pushLexerMode(PythonLexer::SQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer::DQ3__FSTRING_MODE: + case PythonLexer::DQ3__FORMAT_SPECIFICATION_MODE: + this->pushLexerMode(PythonLexer::DQ3__FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + case PythonLexer::DQ3R_FSTRING_MODE: + case PythonLexer::DQ3R_FORMAT_SPECIFICATION_MODE: + this->pushLexerMode(PythonLexer::DQ3R_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode + break; + } + } +} + +void PythonLexerBase::popByBRACE() { + this->paren_or_bracket_openedStack.pop_back(); + this->prevBraceExpression = *this->braceExpressionStack.rbegin() + "}"; + this->braceExpressionStack.pop_back(); + + if (this->braceExpressionStack.size() > 0) { + // append the current brace expression with the previous brace expression + (*this->braceExpressionStack.rbegin()) += this->prevBraceExpression; + } +} + +void PythonLexerBase::handleFSTRING_MIDDLEtokenWithDoubleBrace() { + // Replace the trailing double brace with a single brace and insert a hidden brace token + auto lastTwoChars = this->getLastTwoCharsOfTheCurTokenText(); + + if (lastTwoChars == "{{") { + this->trimLastCharAddPendingTokenSetCurToken(PythonLexer::LBRACE, "{", Token::HIDDEN_CHANNEL); + } else if (lastTwoChars == "}}") { + this->trimLastCharAddPendingTokenSetCurToken(PythonLexer::RBRACE, "}", Token::HIDDEN_CHANNEL); + } +} + +void PythonLexerBase::handleFSTRING_MIDDLEtokenWithQuoteAndLBrace() { + // Replace the trailing quote + left_brace with a quote and insert an LBRACE token + // Replace the trailing backslash + left_brace with a backslash and insert an LBRACE token + auto lastTwoChars = this->getLastTwoCharsOfTheCurTokenText(); + + if (lastTwoChars == "\"{" || lastTwoChars == "'{" || lastTwoChars == "\\{") { + this->trimLastCharAddPendingTokenSetCurToken(PythonLexer::LBRACE, "{", Token::DEFAULT_CHANNEL); + } +} + +std::string PythonLexerBase::getLastTwoCharsOfTheCurTokenText() { + return this->curToken->getText().substr(-2); +} + +void PythonLexerBase::trimLastCharAddPendingTokenSetCurToken( + size_t number, + const std::string &text, + size_t channel +) { + // Trim the last char and add the modified curToken to the pendingTokens stack + auto tokenTextWithoutLastChar = this->curToken->getText().substr(0, -1); + + this->addPendingToken(this->cloneToken(this->curToken, tokenTextWithoutLastChar)); + + this->createNewCurToken(type, text, channel); // Set curToken +} + +void PythonLexerBase::handleCOLONEQUALtokenInFString() { + if ( + this->lexerModeStack.size() > 0 && + *this->paren_or_bracket_openedStack.rbegin() == 0 // stack peek == 0 + ) { + // In fstring, a colonequal (walrus operator) can only be used in parentheses + // Not in parentheses, replace COLONEQUAL token with COLON as format specifier + // and insert the equal symbol to the following FSTRING_MIDDLE token + + this->curToken = this->cloneToken(this->curToken, PythonLexer::COLON, ":", channel); + + if (this->ffgToken->getType() == PythonLexer::FSTRING_MIDDLE) { + + this->ffgToken = this->_factory->create( + {this, this->_input}, + this->ffgToken->getType(), + "=" + this->ffgToken->getText(), + channel, + this->ffgToken->getStartIndex() - 1, + this->ffgToken->getStartIndex(), + this->ffgToken->getLine(), + this->ffgToken->getCharPositionInLine() - 1 + ); + } else { + this->addPendingToken(this->curToken); + this->createNewCurToken(PythonLexer::FSTRING_MIDDLE, "=", Token::DEFAULT_CHANNEL); + } + } + + this->addPendingToken(this->curToken); +} + +void PythonLexerBase::createNewCurToken( + size_t type, + const std::string &text, + size_t channel) +{ + this->curToken = std::move(this->_factory->create( + {this, this->_input}, + type, + text, + channel, + this->curToken->getStartIndex() + 1, + this->curToken->getStartIndex(), + this->curToken->getLine(), + this->curToken->getCharPositionInLine() + )); +} + +void PythonLexerBase::pushLexerMode(size_t mode) { + this->pushMode(mode); + this->lexerModeStack.push_back(this->curLexerMode); + this->curLexerMode = mode; +} + +void PythonLexerBase::popLexerMode() { + this->popMode(); + this->curLexerMode = *this->lexerModeStack.rbegin(); + this->lexerModeStack.pop_back(); +} + +void PythonLexerBase::handleFORMAT_SPECIFICATION_MODE() { + if (this->lexerModeStack.size() > 0 && + this->ffgToken->getType() == PythonLexer::RBRACE) { + + // insert an empty FSTRING_MIDDLE token instead of the missing format specification + switch (this->curToken->getType()) { + case PythonLexer::COLON: + this->createAndAddPendingToken(PythonLexer::FSTRING_MIDDLE, Token::DEFAULT_CHANNEL, "", this->ffgToken); + break; + case PythonLexer::RBRACE: + // only if the previous brace expression is not a dictionary comprehension or set comprehension + if (!this->isDictionaryComprehensionOrSetComprehension(this->prevBraceExpression)) { + this->createAndAddPendingToken(PythonLexer::FSTRING_MIDDLE, Token::DEFAULT_CHANNEL, "", this->ffgToken); + } + break; + } + } +} + +bool PythonLexerBase::isDictionaryComprehensionOrSetComprehension(const std::string &code) { + auto inputStream = std::make_unique(code); + auto lexer = std::make_unique(inputStream.get()); + auto tokenStream = std::make_unique(lexer.get()); + auto parser = std::make_unique(tokenStream.get()); + + // Disable error listeners to suppress console output + lexer->removeErrorListeners(); + parser->removeErrorListeners(); + + parser->dictcomp(); // Try parsing as dictionary comprehension + if (parser->getNumberOfSyntaxErrors() == 0) + return true; + + parser = std::make_unique(tokenStream.get()); + + tokenStream->seek(0); + + parser->removeErrorListeners(); + parser->setcomp(); // Try parsing as set comprehension + return parser->getNumberOfSyntaxErrors() == 0; +} + +void PythonLexerBase::insertTrailingTokens() { + switch (this->lastPendingTokenTypeFromDefaultChannel) { + case PythonLexer::NEWLINE: + case PythonLexer::DEDENT: + break; // no trailing NEWLINE token is needed + default: + // insert an extra trailing NEWLINE token that serves as the end of the last statement + this->createAndAddPendingToken(PythonLexer::NEWLINE, Token::DEFAULT_CHANNEL, this->ffgToken); // ffgToken is EOF + break; + } + this->insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed +} + +void PythonLexerBase::handleEOFtoken() { + if (this->lastPendingTokenTypeFromDefaultChannel > 0) { + // there was a statement in the input (leading NEWLINE tokens are hidden) + this->insertTrailingTokens(); + } + this->addPendingToken(this->curToken); +} + +void PythonLexerBase::hideAndAddPendingToken(const std::unique_ptr &tkn) { + this->addPendingToken(this->cloneToken(tkn, Token::HIDDEN_CHANNEL)); +} + +void PythonLexerBase::createAndAddPendingToken( + size_t type, + size_t channel, + const std::string &text, + const std::unique_ptr &sampleToken +) { + this->addPendingToken( + this->_factory->create( + {this, this->_input}, + type, + text, + channel, + sampleToken->getStartIndex(), + sampleToken->getStartIndex() - 1, + sampleToken->getLine(), + sampleToken->getCharPositionInLine() + ) + ); +} + +void PythonLexerBase::createAndAddPendingToken( + size_t type, + size_t channel, + const std::unique_ptr &sampleToken +) { + this->createAndAddPendingToken( + type, + channel, + "<$" + this->getVocabulary().getDisplayName(type) + ">", + sampleToken + ); +} + +void PythonLexerBase::addPendingToken(const std::unique_ptr &tkn) { + // save the last pending token type because the pendingTokens list can be empty by the nextToken() + this->previousPendingTokenType = tkn->getType(); + if (tkn->getChannel() == Token::DEFAULT_CHANNEL) { + this->lastPendingTokenTypeFromDefaultChannel = this->previousPendingTokenType; + } + + this->pendingTokens.push_back(this->cloneToken(tkn)) /* .addLast(token) */; +} + +size_t PythonLexerBase::getIndentationLength(const std::string &indentText) { // the indentText may contain spaces, tabs or form feeds + const size_t TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces + size_t length = 0; + for (const auto &ch : indentText) { + switch (ch) { + case ' ': + this->wasSpaceIndentation = true; + length += 1; + break; + case '\t': + this->wasTabIndentation = true; + length += TAB_LENGTH - (length % TAB_LENGTH); + break; + case '\f': // form feed + length = 0; + break; + } + } + + if (this->wasTabIndentation && this->wasSpaceIndentation) { + if (!this->wasIndentationMixedWithSpacesAndTabs) { + this->wasIndentationMixedWithSpacesAndTabs = true; + length = PythonLexerBase::INVALID_LENGTH; // only for the first inconsistent indent + } + } + + return length; +} + +void PythonLexerBase::reportLexerError(const std::string &errMsg) { + this->getErrorListenerDispatch().syntaxError( + this, + 0 /* this->curToken */, + this->curToken->getLine(), + this->curToken->getCharPositionInLine(), + " LEXER" + PythonLexerBase::ERR_TXT + errMsg, + nullptr + ); +} + +void PythonLexerBase::reportError(const std::string &errMsg) { + this->reportLexerError(errMsg); + + // the ERRORTOKEN will raise an error in the parser + this->createAndAddPendingToken(PythonLexer::ERRORTOKEN, Token::DEFAULT_CHANNEL, PythonLexerBase::ERR_TXT + errMsg, this->ffgToken); +} diff --git a/python/python3_13/Cpp/PythonLexerBase.h b/python/python3_13/Cpp/PythonLexerBase.h new file mode 100644 index 0000000000..24b6337242 --- /dev/null +++ b/python/python3_13/Cpp/PythonLexerBase.h @@ -0,0 +1,121 @@ +/* +The MIT License (MIT) +Copyright (c) 2021 Robert Einhorn + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + */ + +/* + * + * Project : Python Indent/Dedent handler for ANTLR4 grammars + * + * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com + * + */ + +#pragma once + +#include +#include +#include +#include +#include +#include "antlr4-runtime.h" +#include "PythonLexer.h" +#include "PythonParser.h" + +using namespace antlr4; + +class PythonLexerBase : public PythonLexer { +public: + explicit PythonLexerBase(CharStream *input): PythonLexer(input) { + this->init(); + } + virtual std::unique_ptr nextToken() override; + virtual void reset() override; +private: + std::unique_ptr cloneToken(const std::unique_ptr &source); + std::unique_ptr cloneToken(const std::unique_ptr &source, size_t channel); + std::unique_ptr cloneToken(const std::unique_ptr &source, const std::string &text); + std::unique_ptr cloneToken(const std::unique_ptr &source, size_t channel, const std::string &text, size_t type); + void init(); + void checkNextToken(); + void setCurrentAndFollowingTokens(); + void insertENCODINGtoken(); + std::string getEncodingName(const std::string &commentText); + void handleStartOfInput(); + void insertLeadingIndentToken(); + void handleNEWLINEtoken(); + void insertIndentOrDedentToken(size_t indentLength); + void checkCurToken(); + void appendToBraceExpression(const std::string &text); + void incrementBraceStack(); + void decrementBraceStack(); + void setLexerModeAfterRBRACEtoken(); + void setLexerModeByFSTRING_STARTtoken(); + void setLexerModeByCOLONorCOLONEQUALtoken(); + void popByBRACE(); + void handleFSTRING_MIDDLEtokenWithDoubleBrace(); + void handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); + std::string getLastTwoCharsOfTheCurTokenText(); + void trimLastCharAddPendingTokenSetCurToken(size_t type, const std::string &text, size_t channel); + void handleCOLONEQUALtokenInFString(); + void createNewCurToken(size_t type, const std::string &text, size_t channel); + void pushLexerMode(size_t mode); + void popLexerMode(); + void handleFORMAT_SPECIFICATION_MODE(); + bool isDictionaryComprehensionOrSetComprehension(const std::string &code); + void insertTrailingTokens(); + void handleEOFtoken(); + void hideAndAddPendingToken(const std::unique_ptr &token); + void createAndAddPendingToken(size_t type, size_t channel, const std::string &text, const std::unique_ptr &sampleToken); + void createAndAddPendingToken(size_t type, size_t channel, const std::unique_ptr &sampleToken); + void addPendingToken(const std::unique_ptr &token); + size_t getIndentationLength(const std::string &identText); + void reportLexerError(const std::string &errMsg); + void reportError(const std::string &errMsg); + + // A stack that keeps track of the indentation lengths + std::stack indentLengthStack; + // A list where tokens are waiting to be loaded into the token stream + std::vector> pendingTokens; + // last pending token types + size_t previousPendingTokenType; + size_t lastPendingTokenTypeFromDefaultChannel; + + // The amount of opened parentheses, square brackets or curly braces + size_t opened; + // The amount of opened parentheses and square brackets in the current lexer mode + std::vector paren_or_bracket_openedStack; + // A stack that stores expression(s) between braces in fstring + std::vector braceExpressionStack; + std::string prevBraceExpression; + + // Instead of this._mode (_mode is not implemented in each ANTLR4 runtime) + size_t curLexerMode; + // Instead of this._modeStack (_modeStack is not implemented in each ANTLR4 runtime) + std::vector lexerModeStack; + bool wasSpaceIndentation; + bool wasTabIndentation; + bool wasIndentationMixedWithSpacesAndTabs; + + std::unique_ptr curToken; // current (under processing) token + std::unique_ptr ffgToken; // following (look ahead) token + + const ssize_t INVALID_LENGTH = -1; + const std::string ERR_TXT = " ERROR: "; +}; \ No newline at end of file From 65a23a046b43f853dfbc89cfe73b736ee6968f92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B8=D1=85=D0=B0=D0=B8=D0=BB=20=D0=9A=D0=BE=D1=80?= =?UTF-8?q?=D0=BC=D0=B0=D0=BD=D0=BE=D0=B2=D1=81=D0=BA=D0=B8=D0=B9?= Date: Fri, 21 Mar 2025 17:25:47 +0300 Subject: [PATCH 2/6] chore: add Cpp to targets --- python/python3_13/desc.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/python3_13/desc.xml b/python/python3_13/desc.xml index 8aa6fdea92..78cc3dad3b 100644 --- a/python/python3_13/desc.xml +++ b/python/python3_13/desc.xml @@ -1,9 +1,9 @@ ^4.13.2 - CSharp;Java;Python3;JavaScript;TypeScript + Cpp;CSharp;Java;Python3;JavaScript;TypeScript - CSharp;Java;Python3;JavaScript;TypeScript + Cpp;CSharp;Java;Python3;JavaScript;TypeScript file_input examples From 47590086beda8213ec02275600a7dfdac118fb64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B8=D1=85=D0=B0=D0=B8=D0=BB=20=D0=9A=D0=BE=D1=80?= =?UTF-8?q?=D0=BC=D0=B0=D0=BD=D0=BE=D0=B2=D1=81=D0=BA=D0=B8=D0=B9?= Date: Fri, 21 Mar 2025 18:04:46 +0300 Subject: [PATCH 3/6] chore: make PythonLexerBase depend on antlr4::Lexer and remoove using stmts --- python/python3_13/Cpp/PythonLexerBase.cpp | 54 +++---- python/python3_13/Cpp/PythonLexerBase.h | 180 +++++++++++----------- 2 files changed, 116 insertions(+), 118 deletions(-) diff --git a/python/python3_13/Cpp/PythonLexerBase.cpp b/python/python3_13/Cpp/PythonLexerBase.cpp index e35d8ee58a..9fce8af49a 100644 --- a/python/python3_13/Cpp/PythonLexerBase.cpp +++ b/python/python3_13/Cpp/PythonLexerBase.cpp @@ -1,12 +1,12 @@ #include "PythonLexerBase.h" - -using namespace antlr4; +#include "PythonLexer.h" +#include "PythonParser.h" // reading the input stream until a return EOF -std::unique_ptr PythonLexerBase::nextToken() { +std::unique_ptr PythonLexerBase::nextToken() { this->checkNextToken(); - std::unique_ptr next; + std::unique_ptr next; if (!this->pendingTokens.empty()) { @@ -22,8 +22,8 @@ void PythonLexerBase::reset() { Lexer::reset(); } -std::unique_ptr PythonLexerBase::cloneToken( - const std::unique_ptr &source, +std::unique_ptr PythonLexerBase::cloneToken( + const std::unique_ptr &source, size_t channel, const std::string &text, size_t type @@ -40,7 +40,7 @@ std::unique_ptr PythonLexerBase::cloneToken( ); } -std::unique_ptr PythonLexerBase::cloneToken(const std::unique_ptr &source, size_t channel) { +std::unique_ptr PythonLexerBase::cloneToken(const std::unique_ptr &source, size_t channel) { return this->cloneToken( source, channel, @@ -49,7 +49,7 @@ std::unique_ptr PythonLexerBase::cloneToken(const std::unique_ptr ); } -std::unique_ptr PythonLexerBase::cloneToken(const std::unique_ptr &source, const std::string &text) { +std::unique_ptr PythonLexerBase::cloneToken(const std::unique_ptr &source, const std::string &text) { return this->cloneToken( source, source->getChannel(), @@ -58,7 +58,7 @@ std::unique_ptr PythonLexerBase::cloneToken(const std::unique_ptr ); } -std::unique_ptr PythonLexerBase::cloneToken(const std::unique_ptr& source) { +std::unique_ptr PythonLexerBase::cloneToken(const std::unique_ptr& source) { return this->_factory->create( { this, this->_input }, source->getType(), @@ -93,7 +93,7 @@ void PythonLexerBase::init() { } void PythonLexerBase::checkNextToken() { - if (this->previousPendingTokenType == Token::EOF) { + if (this->previousPendingTokenType == antlr4::Token::EOF) { return; } @@ -154,7 +154,7 @@ void PythonLexerBase::setCurrentAndFollowingTokens() { this->checkCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)! if (this->curToken->getType() == PythonLexer::EOF) { - this->ffgToken = this->cloneToken(this->ffgToken); + this->ffgToken = this->cloneToken(this->curToken); } else { this->ffgToken = PythonLexer::nextToken(); } @@ -199,11 +199,11 @@ void PythonLexerBase::insertENCODINGtoken() { // https://peps.python.org/pep-026 encodingName = "utf-8"; // default Python source code encoding } - std::unique_ptr encodingToken = this->_factory->create( + std::unique_ptr encodingToken = this->_factory->create( {this, this->_input}, PythonLexer::ENCODING, encodingName, - Token::HIDDEN_CHANNEL, + antlr4::Token::HIDDEN_CHANNEL, 0, 0, 0, @@ -232,7 +232,7 @@ void PythonLexerBase::handleStartOfInput() { this->indentLengthStack.push(0); // this will never be popped off while (this->curToken->getType() != PythonLexer::EOF) { - if (this->curToken->getChannel() == Token::DEFAULT_CHANNEL) { + if (this->curToken->getChannel() == antlr4::Token::DEFAULT_CHANNEL) { if (this->curToken->getType() == PythonLexer::NEWLINE) { // all the NEWLINE tokens must be ignored before the first statement this->hideAndAddPendingToken(this->curToken); @@ -514,7 +514,7 @@ void PythonLexerBase::handleFSTRING_MIDDLEtokenWithQuoteAndLBrace() { auto lastTwoChars = this->getLastTwoCharsOfTheCurTokenText(); if (lastTwoChars == "\"{" || lastTwoChars == "'{" || lastTwoChars == "\\{") { - this->trimLastCharAddPendingTokenSetCurToken(PythonLexer::LBRACE, "{", Token::DEFAULT_CHANNEL); + this->trimLastCharAddPendingTokenSetCurToken(PythonLexer::LBRACE, "{", antlr4::Token::DEFAULT_CHANNEL); } } @@ -560,7 +560,7 @@ void PythonLexerBase::handleCOLONEQUALtokenInFString() { ); } else { this->addPendingToken(this->curToken); - this->createNewCurToken(PythonLexer::FSTRING_MIDDLE, "=", Token::DEFAULT_CHANNEL); + this->createNewCurToken(PythonLexer::FSTRING_MIDDLE, "=", antlr4::Token::DEFAULT_CHANNEL); } } @@ -603,12 +603,12 @@ void PythonLexerBase::handleFORMAT_SPECIFICATION_MODE() { // insert an empty FSTRING_MIDDLE token instead of the missing format specification switch (this->curToken->getType()) { case PythonLexer::COLON: - this->createAndAddPendingToken(PythonLexer::FSTRING_MIDDLE, Token::DEFAULT_CHANNEL, "", this->ffgToken); + this->createAndAddPendingToken(PythonLexer::FSTRING_MIDDLE, antlr4::Token::DEFAULT_CHANNEL, "", this->ffgToken); break; case PythonLexer::RBRACE: // only if the previous brace expression is not a dictionary comprehension or set comprehension if (!this->isDictionaryComprehensionOrSetComprehension(this->prevBraceExpression)) { - this->createAndAddPendingToken(PythonLexer::FSTRING_MIDDLE, Token::DEFAULT_CHANNEL, "", this->ffgToken); + this->createAndAddPendingToken(PythonLexer::FSTRING_MIDDLE, antlr4::Token::DEFAULT_CHANNEL, "", this->ffgToken); } break; } @@ -616,7 +616,7 @@ void PythonLexerBase::handleFORMAT_SPECIFICATION_MODE() { } bool PythonLexerBase::isDictionaryComprehensionOrSetComprehension(const std::string &code) { - auto inputStream = std::make_unique(code); + auto inputStream = std::make_unique(code); auto lexer = std::make_unique(inputStream.get()); auto tokenStream = std::make_unique(lexer.get()); auto parser = std::make_unique(tokenStream.get()); @@ -645,7 +645,7 @@ void PythonLexerBase::insertTrailingTokens() { break; // no trailing NEWLINE token is needed default: // insert an extra trailing NEWLINE token that serves as the end of the last statement - this->createAndAddPendingToken(PythonLexer::NEWLINE, Token::DEFAULT_CHANNEL, this->ffgToken); // ffgToken is EOF + this->createAndAddPendingToken(PythonLexer::NEWLINE, antlr4::Token::DEFAULT_CHANNEL, this->ffgToken); // ffgToken is EOF break; } this->insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed @@ -659,15 +659,15 @@ void PythonLexerBase::handleEOFtoken() { this->addPendingToken(this->curToken); } -void PythonLexerBase::hideAndAddPendingToken(const std::unique_ptr &tkn) { - this->addPendingToken(this->cloneToken(tkn, Token::HIDDEN_CHANNEL)); +void PythonLexerBase::hideAndAddPendingToken(const std::unique_ptr &tkn) { + this->addPendingToken(this->cloneToken(tkn, antlr4::Token::HIDDEN_CHANNEL)); } void PythonLexerBase::createAndAddPendingToken( size_t type, size_t channel, const std::string &text, - const std::unique_ptr &sampleToken + const std::unique_ptr &sampleToken ) { this->addPendingToken( this->_factory->create( @@ -686,7 +686,7 @@ void PythonLexerBase::createAndAddPendingToken( void PythonLexerBase::createAndAddPendingToken( size_t type, size_t channel, - const std::unique_ptr &sampleToken + const std::unique_ptr &sampleToken ) { this->createAndAddPendingToken( type, @@ -696,10 +696,10 @@ void PythonLexerBase::createAndAddPendingToken( ); } -void PythonLexerBase::addPendingToken(const std::unique_ptr &tkn) { +void PythonLexerBase::addPendingToken(const std::unique_ptr &tkn) { // save the last pending token type because the pendingTokens list can be empty by the nextToken() this->previousPendingTokenType = tkn->getType(); - if (tkn->getChannel() == Token::DEFAULT_CHANNEL) { + if (tkn->getChannel() == antlr4::Token::DEFAULT_CHANNEL) { this->lastPendingTokenTypeFromDefaultChannel = this->previousPendingTokenType; } @@ -750,5 +750,5 @@ void PythonLexerBase::reportError(const std::string &errMsg) { this->reportLexerError(errMsg); // the ERRORTOKEN will raise an error in the parser - this->createAndAddPendingToken(PythonLexer::ERRORTOKEN, Token::DEFAULT_CHANNEL, PythonLexerBase::ERR_TXT + errMsg, this->ffgToken); + this->createAndAddPendingToken(PythonLexer::ERRORTOKEN, antlr4::Token::DEFAULT_CHANNEL, PythonLexerBase::ERR_TXT + errMsg, this->ffgToken); } diff --git a/python/python3_13/Cpp/PythonLexerBase.h b/python/python3_13/Cpp/PythonLexerBase.h index 24b6337242..761cf5b400 100644 --- a/python/python3_13/Cpp/PythonLexerBase.h +++ b/python/python3_13/Cpp/PythonLexerBase.h @@ -27,95 +27,93 @@ THE SOFTWARE. * */ -#pragma once + #pragma once -#include -#include -#include -#include -#include -#include "antlr4-runtime.h" -#include "PythonLexer.h" -#include "PythonParser.h" - -using namespace antlr4; - -class PythonLexerBase : public PythonLexer { -public: - explicit PythonLexerBase(CharStream *input): PythonLexer(input) { - this->init(); - } - virtual std::unique_ptr nextToken() override; - virtual void reset() override; -private: - std::unique_ptr cloneToken(const std::unique_ptr &source); - std::unique_ptr cloneToken(const std::unique_ptr &source, size_t channel); - std::unique_ptr cloneToken(const std::unique_ptr &source, const std::string &text); - std::unique_ptr cloneToken(const std::unique_ptr &source, size_t channel, const std::string &text, size_t type); - void init(); - void checkNextToken(); - void setCurrentAndFollowingTokens(); - void insertENCODINGtoken(); - std::string getEncodingName(const std::string &commentText); - void handleStartOfInput(); - void insertLeadingIndentToken(); - void handleNEWLINEtoken(); - void insertIndentOrDedentToken(size_t indentLength); - void checkCurToken(); - void appendToBraceExpression(const std::string &text); - void incrementBraceStack(); - void decrementBraceStack(); - void setLexerModeAfterRBRACEtoken(); - void setLexerModeByFSTRING_STARTtoken(); - void setLexerModeByCOLONorCOLONEQUALtoken(); - void popByBRACE(); - void handleFSTRING_MIDDLEtokenWithDoubleBrace(); - void handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); - std::string getLastTwoCharsOfTheCurTokenText(); - void trimLastCharAddPendingTokenSetCurToken(size_t type, const std::string &text, size_t channel); - void handleCOLONEQUALtokenInFString(); - void createNewCurToken(size_t type, const std::string &text, size_t channel); - void pushLexerMode(size_t mode); - void popLexerMode(); - void handleFORMAT_SPECIFICATION_MODE(); - bool isDictionaryComprehensionOrSetComprehension(const std::string &code); - void insertTrailingTokens(); - void handleEOFtoken(); - void hideAndAddPendingToken(const std::unique_ptr &token); - void createAndAddPendingToken(size_t type, size_t channel, const std::string &text, const std::unique_ptr &sampleToken); - void createAndAddPendingToken(size_t type, size_t channel, const std::unique_ptr &sampleToken); - void addPendingToken(const std::unique_ptr &token); - size_t getIndentationLength(const std::string &identText); - void reportLexerError(const std::string &errMsg); - void reportError(const std::string &errMsg); - - // A stack that keeps track of the indentation lengths - std::stack indentLengthStack; - // A list where tokens are waiting to be loaded into the token stream - std::vector> pendingTokens; - // last pending token types - size_t previousPendingTokenType; - size_t lastPendingTokenTypeFromDefaultChannel; - - // The amount of opened parentheses, square brackets or curly braces - size_t opened; - // The amount of opened parentheses and square brackets in the current lexer mode - std::vector paren_or_bracket_openedStack; - // A stack that stores expression(s) between braces in fstring - std::vector braceExpressionStack; - std::string prevBraceExpression; - - // Instead of this._mode (_mode is not implemented in each ANTLR4 runtime) - size_t curLexerMode; - // Instead of this._modeStack (_modeStack is not implemented in each ANTLR4 runtime) - std::vector lexerModeStack; - bool wasSpaceIndentation; - bool wasTabIndentation; - bool wasIndentationMixedWithSpacesAndTabs; - - std::unique_ptr curToken; // current (under processing) token - std::unique_ptr ffgToken; // following (look ahead) token - - const ssize_t INVALID_LENGTH = -1; - const std::string ERR_TXT = " ERROR: "; -}; \ No newline at end of file + #include + #include + #include + #include + #include + + #include "antlr4-runtime.h" + + class PythonLexerBase : public antlr4::Lexer { + public: + explicit PythonLexerBase(antlr4::CharStream *input): antlr4::Lexer(input) { + this->init(); + } + virtual std::unique_ptr nextToken() override; + virtual void reset() override; + private: + std::unique_ptr cloneToken(const std::unique_ptr &source); + std::unique_ptr cloneToken(const std::unique_ptr &source, size_t channel); + std::unique_ptr cloneToken(const std::unique_ptr &source, const std::string &text); + std::unique_ptr cloneToken(const std::unique_ptr &source, size_t channel, const std::string &text, size_t type); + void init(); + void checkNextToken(); + void setCurrentAndFollowingTokens(); + void insertENCODINGtoken(); + std::string getEncodingName(const std::string &commentText); + void handleStartOfInput(); + void insertLeadingIndentToken(); + void handleNEWLINEtoken(); + void insertIndentOrDedentToken(size_t indentLength); + void checkCurToken(); + void appendToBraceExpression(const std::string &text); + void incrementBraceStack(); + void decrementBraceStack(); + void setLexerModeAfterRBRACEtoken(); + void setLexerModeByFSTRING_STARTtoken(); + void setLexerModeByCOLONorCOLONEQUALtoken(); + void popByBRACE(); + void handleFSTRING_MIDDLEtokenWithDoubleBrace(); + void handleFSTRING_MIDDLEtokenWithQuoteAndLBrace(); + std::string getLastTwoCharsOfTheCurTokenText(); + void trimLastCharAddPendingTokenSetCurToken(size_t type, const std::string &text, size_t channel); + void handleCOLONEQUALtokenInFString(); + void createNewCurToken(size_t type, const std::string &text, size_t channel); + void pushLexerMode(size_t mode); + void popLexerMode(); + void handleFORMAT_SPECIFICATION_MODE(); + bool isDictionaryComprehensionOrSetComprehension(const std::string &code); + void insertTrailingTokens(); + void handleEOFtoken(); + void hideAndAddPendingToken(const std::unique_ptr &token); + void createAndAddPendingToken(size_t type, size_t channel, const std::string &text, const std::unique_ptr &sampleToken); + void createAndAddPendingToken(size_t type, size_t channel, const std::unique_ptr &sampleToken); + void addPendingToken(const std::unique_ptr &token); + size_t getIndentationLength(const std::string &identText); + void reportLexerError(const std::string &errMsg); + void reportError(const std::string &errMsg); + + // A stack that keeps track of the indentation lengths + std::stack indentLengthStack; + // A list where tokens are waiting to be loaded into the token stream + std::vector> pendingTokens; + // last pending token types + size_t previousPendingTokenType; + size_t lastPendingTokenTypeFromDefaultChannel; + + // The amount of opened parentheses, square brackets or curly braces + size_t opened; + // The amount of opened parentheses and square brackets in the current lexer mode + std::vector paren_or_bracket_openedStack; + // A stack that stores expression(s) between braces in fstring + std::vector braceExpressionStack; + std::string prevBraceExpression; + + // Instead of this._mode (_mode is not implemented in each ANTLR4 runtime) + size_t curLexerMode; + // Instead of this._modeStack (_modeStack is not implemented in each ANTLR4 runtime) + std::vector lexerModeStack; + bool wasSpaceIndentation; + bool wasTabIndentation; + bool wasIndentationMixedWithSpacesAndTabs; + + std::unique_ptr curToken; // current (under processing) token + std::unique_ptr ffgToken; // following (look ahead) token + + const ssize_t INVALID_LENGTH = -1; + const std::string ERR_TXT = " ERROR: "; + }; + \ No newline at end of file From 3bd33de17c7ecdc5724467babc84788133f0267a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B8=D1=85=D0=B0=D0=B8=D0=BB=20=D0=9A=D0=BE=D1=80?= =?UTF-8?q?=D0=BC=D0=B0=D0=BD=D0=BE=D0=B2=D1=81=D0=BA=D0=B8=D0=B9?= Date: Fri, 21 Mar 2025 18:06:34 +0300 Subject: [PATCH 4/6] chore: added placeholder for C++ header --- python/python3_13/PythonLexer.g4 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/python3_13/PythonLexer.g4 b/python/python3_13/PythonLexer.g4 index 98b99d4aef..da15a6a9e8 100644 --- a/python/python3_13/PythonLexer.g4 +++ b/python/python3_13/PythonLexer.g4 @@ -32,6 +32,8 @@ lexer grammar PythonLexer; options { superClass=PythonLexerBase; } +// Insert here @header for C++ lexer. + tokens { ENCODING // https://docs.python.org/3.13/reference/lexical_analysis.html#encoding-declarations , INDENT, DEDENT // https://docs.python.org/3.13/reference/lexical_analysis.html#indentation From 8ff7691db33319c872616414dac62820eafa0025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B8=D1=85=D0=B0=D0=B8=D0=BB=20=D0=9A=D0=BE=D1=80?= =?UTF-8?q?=D0=BC=D0=B0=D0=BD=D0=BE=D0=B2=D1=81=D0=BA=D0=B8=D0=B9?= Date: Fri, 21 Mar 2025 18:07:43 +0300 Subject: [PATCH 5/6] chore: added transformGrammar.py --- python/python3_13/Cpp/transformGrammar.py | 32 +++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 python/python3_13/Cpp/transformGrammar.py diff --git a/python/python3_13/Cpp/transformGrammar.py b/python/python3_13/Cpp/transformGrammar.py new file mode 100644 index 0000000000..7b2f208260 --- /dev/null +++ b/python/python3_13/Cpp/transformGrammar.py @@ -0,0 +1,32 @@ +import sys, os, re, shutil +from glob import glob +from pathlib import Path + +def main(argv): + for file in glob("./*.g4"): + fix(file) + +def fix(file_path): + print("Altering " + file_path) + if not os.path.exists(file_path): + print(f"Could not find file: {file_path}") + sys.exit(1) + parts = os.path.split(file_path) + file_name = parts[-1] + shutil.move(file_path, file_path + ".bak") + input_file = open(file_path + ".bak",'r') + output_file = open(file_path, 'w') + for x in input_file: + if '// Insert here @header for lexer.' in x: + x = x.replace('// Insert here @header for lexer.', '@header {#include "PythonLexerBase.h"}') + if 'this.' in x: + x = x.replace('this.', 'this->') + output_file.write(x) + output_file.flush() + + print("Writing ...") + input_file.close() + output_file.close() + +if __name__ == '__main__': + main(sys.argv) From d96cdfa5931f8a5b81b665183e711fcc822284b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B8=D1=85=D0=B0=D0=B8=D0=BB=20=D0=9A=D0=BE=D1=80?= =?UTF-8?q?=D0=BC=D0=B0=D0=BD=D0=BE=D0=B2=D1=81=D0=BA=D0=B8=D0=B9?= Date: Fri, 21 Mar 2025 19:02:46 +0300 Subject: [PATCH 6/6] fix: fixed errors due to bad copy-paste --- python/python3_13/Cpp/PythonLexerBase.cpp | 18 +++++++++--------- python/python3_13/Cpp/PythonLexerBase.h | 1 - python/python3_13/PythonLexer.g4 | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/python/python3_13/Cpp/PythonLexerBase.cpp b/python/python3_13/Cpp/PythonLexerBase.cpp index 9fce8af49a..5672ce8401 100644 --- a/python/python3_13/Cpp/PythonLexerBase.cpp +++ b/python/python3_13/Cpp/PythonLexerBase.cpp @@ -148,7 +148,7 @@ void PythonLexerBase::setCurrentAndFollowingTokens() { if (this->ffgToken) { this->curToken = this->cloneToken(this->ffgToken); } else { - this->curToken = PythonLexer::nextToken(); + this->curToken = antlr4::Lexer::nextToken(); } this->checkCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)! @@ -156,7 +156,7 @@ void PythonLexerBase::setCurrentAndFollowingTokens() { if (this->curToken->getType() == PythonLexer::EOF) { this->ffgToken = this->cloneToken(this->curToken); } else { - this->ffgToken = PythonLexer::nextToken(); + this->ffgToken = antlr4::Lexer::nextToken(); } } @@ -259,7 +259,7 @@ void PythonLexerBase::insertLeadingIndentToken() { // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser this->createAndAddPendingToken( PythonLexer::INDENT, - Token::DEFAULT_CHANNEL, + antlr4::Token::DEFAULT_CHANNEL, PythonLexerBase::ERR_TXT + errMsg, this->curToken ); @@ -311,7 +311,7 @@ void PythonLexerBase::insertIndentOrDedentToken(size_t indentLength) { auto prevIndentLength = this->indentLengthStack.top(); if (indentLength > prevIndentLength) { - this->createAndAddPendingToken(PythonLexer::INDENT, Token::DEFAULT_CHANNEL, this->ffgToken); + this->createAndAddPendingToken(PythonLexer::INDENT, antlr4::Token::DEFAULT_CHANNEL, this->ffgToken); this->indentLengthStack.push(indentLength); } else { while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream @@ -319,7 +319,7 @@ void PythonLexerBase::insertIndentOrDedentToken(size_t indentLength) { prevIndentLength = this->indentLengthStack.top(); if (indentLength <= prevIndentLength) { - this->createAndAddPendingToken(PythonLexer::DEDENT, Token::DEFAULT_CHANNEL, this->ffgToken); + this->createAndAddPendingToken(PythonLexer::DEDENT, antlr4::Token::DEFAULT_CHANNEL, this->ffgToken); } else { this->reportError("inconsistent dedent"); } @@ -351,7 +351,7 @@ void PythonLexerBase::checkCurToken() { case PythonLexer::NEWLINE: // append the current brace expression with the current newline this->appendToBraceExpression(this->curToken->getText()); - this->curToken = this->cloneToken(this->curToken, Token::HIDDEN_CHANNEL); + this->curToken = this->cloneToken(this->curToken, antlr4::Token::HIDDEN_CHANNEL); break; case PythonLexer::LBRACE: // the outermost brace expression cannot be a dictionary comprehension or a set comprehension @@ -502,9 +502,9 @@ void PythonLexerBase::handleFSTRING_MIDDLEtokenWithDoubleBrace() { auto lastTwoChars = this->getLastTwoCharsOfTheCurTokenText(); if (lastTwoChars == "{{") { - this->trimLastCharAddPendingTokenSetCurToken(PythonLexer::LBRACE, "{", Token::HIDDEN_CHANNEL); + this->trimLastCharAddPendingTokenSetCurToken(PythonLexer::LBRACE, "{", antlr4::Token::HIDDEN_CHANNEL); } else if (lastTwoChars == "}}") { - this->trimLastCharAddPendingTokenSetCurToken(PythonLexer::RBRACE, "}", Token::HIDDEN_CHANNEL); + this->trimLastCharAddPendingTokenSetCurToken(PythonLexer::RBRACE, "}", antlr4::Token::HIDDEN_CHANNEL); } } @@ -618,7 +618,7 @@ void PythonLexerBase::handleFORMAT_SPECIFICATION_MODE() { bool PythonLexerBase::isDictionaryComprehensionOrSetComprehension(const std::string &code) { auto inputStream = std::make_unique(code); auto lexer = std::make_unique(inputStream.get()); - auto tokenStream = std::make_unique(lexer.get()); + auto tokenStream = std::make_unique(lexer.get()); auto parser = std::make_unique(tokenStream.get()); // Disable error listeners to suppress console output diff --git a/python/python3_13/Cpp/PythonLexerBase.h b/python/python3_13/Cpp/PythonLexerBase.h index 761cf5b400..57828582af 100644 --- a/python/python3_13/Cpp/PythonLexerBase.h +++ b/python/python3_13/Cpp/PythonLexerBase.h @@ -116,4 +116,3 @@ THE SOFTWARE. const ssize_t INVALID_LENGTH = -1; const std::string ERR_TXT = " ERROR: "; }; - \ No newline at end of file diff --git a/python/python3_13/PythonLexer.g4 b/python/python3_13/PythonLexer.g4 index da15a6a9e8..f2c036b9ae 100644 --- a/python/python3_13/PythonLexer.g4 +++ b/python/python3_13/PythonLexer.g4 @@ -32,7 +32,7 @@ lexer grammar PythonLexer; options { superClass=PythonLexerBase; } -// Insert here @header for C++ lexer. +// Insert here @header for lexer. tokens { ENCODING // https://docs.python.org/3.13/reference/lexical_analysis.html#encoding-declarations