"""Unit tests for the error-guidance helpers and the guidance lines they log."""
import io
import logging
import os
import sys
import unittest

# This file lives two directories below the repository root; put the root on
# sys.path so the ``mlc`` package can be imported when run as a plain script.
REPO_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, REPO_ROOT)

from mlc.error_codes import get_error_guidance
from mlc.main import _report_error, logger
from mlc.script_action import ScriptExecutionError


class ErrorGuidanceTest(unittest.TestCase):
    """Checks error-code detection and the guidance emitted by ``_report_error``."""

    def test_detects_disk_space_errors_from_message(self):
        """The code embedded in the message (28) replaces the generic code 1."""
        result = get_error_guidance(
            1,
            "Command execution failed with error code 28. No space left on device.")

        self.assertIsNotNone(result)
        self.assertEqual(result["error_code"], 28)
        self.assertIn("disk space", result["error_message"].lower())
        disk_hints = [
            hint for hint in result["suggestions"] if "Free disk space" in hint]
        self.assertTrue(disk_hints)

    def test_detects_segmentation_fault_errors(self):
        """Exit code 139 with a segfault message yields segfault guidance."""
        result = get_error_guidance(139, "Segmentation fault (core dumped)")

        self.assertIsNotNone(result)
        self.assertEqual(result["error_code"], 139)
        self.assertIn("segmentation fault", result["error_message"].lower())

    def test_report_error_logs_actionable_guidance(self):
        """``_report_error`` writes the code, cause and suggestions to the log."""
        captured = io.StringIO()
        capture_handler = logging.StreamHandler(captured)
        capture_handler.setLevel(logging.ERROR)
        logger.addHandler(capture_handler)

        try:
            failure_context = {
                "script_name": "detect,cpu",
                "run_args": {"target": "script", "action": "run"},
                "error_code": 139,
                "error_guidance": get_error_guidance(
                    139, "Segmentation fault (core dumped)"),
            }
            failure = ScriptExecutionError(
                "Script run execution failed.", **failure_context)

            _report_error(failure)
        finally:
            # Always detach the handler so other tests see a clean logger.
            logger.removeHandler(capture_handler)

        logged_text = captured.getvalue()
        for expected_fragment in (
                "Detected error code: 139", "Likely cause:", "Suggestion:"):
            self.assertIn(expected_fragment, logged_text)


if __name__ == "__main__":
    unittest.main()
"""Error/warning codes for MLCFlow and helpers that turn failures into guidance."""
import re
from enum import Enum, auto


class ErrorCode(Enum):
    """Enum class for error codes in MLCFlow.

    Each member's value is a ``(code, description)`` tuple; ``__init__``
    unpacks it into the ``code`` and ``description`` attributes.
    """
    # General errors (2000-2007)
    AUTOMATION_SCRIPT_NOT_FOUND = (
        2000, "The specified automation script was not found")
    PATH_DOES_NOT_EXIST = (2001, "Provided path does not exists")
    FILE_NOT_FOUND = (2002, "Required file was not found")
    PERMISSION_DENIED = (2003, "Insufficient permission to execute the script")
    IO_Error = (2004, "File I/O operation failed")
    AUTOMATION_CUSTOM_ERROR = (2005, "Custom error triggered by the script")
    UNSUPPORTED_OS = (
        2006, "The Operating System is not supported by the script")
    MISSING_ENV_VARIABLE = (2007, "Required environment variables are missing")

    def __init__(self, code, description):
        self.code = code
        self.description = description


# Numeric codes of every ErrorCode member, for fast membership checks.
ERROR_CODES = {error.code for error in ErrorCode}

# Patterns that pull a numeric code out of free-form error text, tried in
# order: "error/exit/return code N", "[Errno N]", then "signal N".
ERROR_CODE_PATTERNS = [
    re.compile(r'(?:error|exit|return)\s+code\s*[:=]?\s*(\d+)', re.IGNORECASE),
    re.compile(r'\[errno\s+(\d+)\]', re.IGNORECASE),
    re.compile(r'signal\s+(\d+)', re.IGNORECASE),
]


class WarningCode(Enum):
    """Enum class for warning codes in MLCFlow.

    Each member's value is a ``(code, description)`` tuple; ``__init__``
    unpacks it into the ``code`` and ``description`` attributes.
    """
    # General warnings (1000-1007)
    IO_WARNING = (1000, "File I/O operation warning")
    AUTOMATION_SCRIPT_NOT_TESTED = (
        1001,
        "the script is not tested on the current operatinig system or is in a development state")
    AUTOMATION_SCRIPT_SKIPPED = (
        1002, "The script has been skipped during execution")
    AUTOMATION_CUSTOM_ERROR = (1003, "Custom warning triggered by the script")
    NON_INTERACTIVE_ENV = (1004, "Non interactive environment detected")
    ELEVATED_PERMISSION_NEEDED = (1005, "Elevated permission needed")
    EMPTY_TARGET = (1006, "The specified target is empty")

    def __init__(self, code, description):
        self.code = code
        self.description = description


def _find_member(enum_cls, code):
    """Return the member of *enum_cls* identified by *code*, or ``None``.

    Member values are ``(code, description)`` tuples, so ``enum_cls(2000)``
    raises ``ValueError`` for a plain integer. Matching on the ``code``
    attribute makes integer lookups work; a member or its full tuple value
    is still accepted, as with the enum call.
    """
    if isinstance(code, enum_cls):
        return code
    for member in enum_cls:
        if member.code == code or member.value == code:
            return member
    return None


def get_error_info(error_code):
    """Get the error message for a given error code.

    Returns:
        ``{"error_code": int, "error_message": str}`` for a known code,
        otherwise the string ``"Unknown error code: <error_code>"``.
    """
    member = _find_member(ErrorCode, error_code)
    if member is None:
        return f"Unknown error code: {error_code}"
    return {"error_code": member.code, "error_message": member.description}


def get_warning_info(warning_code):
    """Get the warning message for a given warning code.

    Returns:
        ``{"warning_code": int, "warning_message": str}`` for a known code,
        otherwise the string ``"Unknown warning code: <warning_code>"``.
    """
    member = _find_member(WarningCode, warning_code)
    if member is None:
        return f"Unknown warning code: {warning_code}"
    return {"warning_code": member.code,
            "warning_message": member.description}


# NOTE(review): is_warning_code(), is_error_code() and get_code_type() are only
# partially visible in the reviewed excerpt (diff context with elided lines).
# They are deliberately not reproduced here and must be kept from the original
# module unchanged.


def _normalize_code(code):
    """Convert a code value to a plain ``int`` when possible, else ``None``.

    Accepts ints (``bool`` collapses to 0/1), integral floats such as
    ``139.0``, and anything whose ``str()`` form parses as an integer.
    """
    if code is None:
        return None
    if isinstance(code, int):
        return int(code)
    if isinstance(code, float):
        # A float return code is only meaningful when it is a whole number.
        return int(code) if code.is_integer() else None
    try:
        return int(str(code).strip())
    except (TypeError, ValueError):
        return None


# Return codes that carry no diagnostic detail on their own.
_GENERIC_RETURN_CODES = (None, 0, 1)


def detect_error_code(return_code=None, error_message=""):
    """Detect the most useful error code from a return value or error text.

    A specific return code (anything other than ``None``, 0 or 1) is returned
    as-is. Otherwise the first code found in *error_message* by
    ``ERROR_CODE_PATTERNS`` is used, falling back to the normalized
    *return_code* when the text contains none.
    """
    normalized_return_code = _normalize_code(return_code)
    if normalized_return_code not in _GENERIC_RETURN_CODES:
        # The text can only refine a generic status; no need to scan it.
        return normalized_return_code

    message = str(error_message or "")
    for pattern in ERROR_CODE_PATTERNS:
        match = pattern.search(message)
        if match:
            return _normalize_code(match.group(1))

    return normalized_return_code


_DISK_FULL_PHRASES = (
    "no space left on device",
    "disk full",
    "not enough space",
)

# 139 = 128 + SIGSEGV(11), while some tools report the raw signal number
# 11 or its negative form -11.
_SEGFAULT_CODES = (139, -11, 11)

_NETWORK_PHRASES = (
    "network",
    "connection",
    "timed out",
    "temporary failure in name resolution",
    "could not resolve host",
)

# Common downloader/network failure codes used by curl and similar tools:
# 6/7 resolve/connect failures, 28 timeout, 35 TLS/connect issue,
# 56 connection reset/read failure, 60 certificate validation failure.
_NETWORK_CODES = (6, 7, 28, 35, 56, 60)

_NOT_EXECUTABLE = (
    "Command found but it could not be executed",
    ("Check file permissions and whether the target command is executable.",),
)
_NOT_FOUND = (
    "Command not found during script execution",
    ("Verify that the required tool is installed and available on PATH.",),
)
_KILLED = (
    "Process was terminated, often due to out-of-memory or a kill signal",
    ("Check system memory limits and retry with fewer parallel jobs if possible.",),
)
_INTERRUPTED = (
    "The command was interrupted by the user or the environment",
    ("Retry the command if the interruption was unexpected.",),
)

# Shell exit statuses with one well-defined meaning. 137 = 128 + SIGKILL(9)
# and 130 = 128 + SIGINT(2); -9 / -2 are the negative "killed by signal N"
# forms, handled the same way -11 is for SIGSEGV.
_EXIT_CODE_GUIDANCE = {
    126: _NOT_EXECUTABLE,
    127: _NOT_FOUND,
    137: _KILLED,
    -9: _KILLED,
    130: _INTERRUPTED,
    -2: _INTERRUPTED,
}


def _classify_failure(error_code, message):
    """Return ``(likely_cause, suggestions)`` for a recognised failure, else ``None``.

    *message* must already be lower-cased. Checks run from most to least
    specific: explicit disk-full phrases, segmentation faults, exact shell
    exit codes, and only then the loose network keywords. A word like
    "connection" in a tool name must not mask "command not found" (127).
    """
    if any(phrase in message for phrase in _DISK_FULL_PHRASES):
        return (
            "Likely disk space exhaustion while running the script",
            [
                "Free disk space in the work/cache directories and retry the command.",
                "Remove old artifacts or caches if they are no longer needed.",
            ],
        )

    if "segmentation fault" in message or error_code in _SEGFAULT_CODES:
        return (
            "A native program crashed with a segmentation fault",
            [
                "Rerun with verbose logs to identify which native command crashed.",
                "Check native dependencies, compiler/runtime compatibility, and input files.",
            ],
        )

    if error_code in _EXIT_CODE_GUIDANCE:
        cause, suggestions = _EXIT_CODE_GUIDANCE[error_code]
        # Hand out a fresh list so callers can mutate it safely.
        return cause, list(suggestions)

    if (any(phrase in message for phrase in _NETWORK_PHRASES)
            or error_code in _NETWORK_CODES):
        return (
            "Likely network or download failure while running the script",
            [
                "Check internet connectivity, proxy/firewall settings, and remote endpoint availability.",
                "Retry the command after verifying the network connection.",
            ],
        )

    return None


def get_error_guidance(return_code=None, error_message=""):
    """Return actionable guidance for known error codes and failure patterns.

    Args:
        return_code: Numeric (or numeric-looking) status of the failed step.
        error_message: Free-form error text; may embed a more specific code.

    Returns:
        ``None`` when nothing is recognised, otherwise a dict with keys
        ``"error_code"`` (detected code or ``None``), ``"error_message"``
        (likely cause; an ``ErrorCode`` description takes precedence over the
        heuristic one) and ``"suggestions"`` (list of str, possibly empty).
    """
    error_code = detect_error_code(return_code, error_message)

    likely_cause = None
    if error_code in ERROR_CODES:
        error_info = get_error_info(error_code)
        if isinstance(error_info, dict):
            likely_cause = error_info["error_message"]

    suggestions = []
    classified = _classify_failure(
        error_code, str(error_message or "").lower())
    if classified is not None:
        heuristic_cause, suggestions = classified
        likely_cause = likely_cause or heuristic_cause

    if not likely_cause and not suggestions:
        return None

    return {
        "error_code": error_code,
        "error_message": likely_cause,
        "suggestions": suggestions,
    }
def _report_error(e): logger.error(f"Failed script: {script_name}") logger.error(f"To rerun just the failed part: {rerun_cmd}") + if error_guidance: + error_code = error_guidance.get("error_code") + if error_code is not None: + logger.error(f"Detected error code: {error_code}") + + error_message = error_guidance.get("error_message") + if error_message: + logger.error(f"Likely cause: {error_message}") + + for suggestion in error_guidance.get("suggestions", []): + logger.error(f"Suggestion: {suggestion}") + if e.version_info_file: logger.error(f"Dependency versions: {e.version_info_file}") diff --git a/mlc/script_action.py b/mlc/script_action.py index 3bdc02f83..bee4d9f00 100644 --- a/mlc/script_action.py +++ b/mlc/script_action.py @@ -7,6 +7,7 @@ import inspect from .index import Index from . import utils +from .error_codes import get_error_guidance from .logger import logger @@ -319,6 +320,8 @@ def call_script_module_function(self, function_name, run_args): if result['return'] > 0: error = result.get('error', "") + error_guidance = get_error_guidance( + result.get('error_code', result.get('return')), error) _name_match = re.search(r'name\s*=\s*([^,)]+)', error) _script_name = _name_match.group(1).strip() if _name_match else run_args.get( 'tags', run_args.get('details')) @@ -338,7 +341,10 @@ def call_script_module_function(self, function_name, run_args): raise ScriptExecutionError( f"Script {function_name} execution failed in {module_path}. 
class ScriptExecutionError(Exception):
    """Exception carrying the context of a failed script execution.

    Besides the message passed to ``Exception``, the instance stores every
    keyword argument as an attribute of the same name. ``run_args`` is
    replaced by an empty dict when it is missing or empty.
    """

    def __init__(self, message, script_name=None, repo_alias=None,
                 module_path=None, run_args=None, version_info_file=None,
                 error_code=None, error_guidance=None):
        super().__init__(message)

        context = {
            "script_name": script_name,
            "repo_alias": repo_alias,
            "module_path": module_path,
            # Always expose a dict, even when no run arguments were given.
            "run_args": run_args if run_args else {},
            "version_info_file": version_info_file,
            "error_code": error_code,
            "error_guidance": error_guidance,
        }
        for attribute, value in context.items():
            setattr(self, attribute, value)