diff --git a/sherlock_project/sites.py b/sherlock_project/sites.py
index b7aaf4c58b..d89b14cd5a 100644
--- a/sherlock_project/sites.py
+++ b/sherlock_project/sites.py
@@ -4,9 +4,16 @@
 This is the raw data that will be used to search for usernames.
 """
 import json
+import os
 import requests
 import secrets
 
+try:
+    from jsonschema import validate, ValidationError
+    _HAS_JSONSCHEMA = True
+except ImportError:
+    _HAS_JSONSCHEMA = False
+
 MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
 EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
 
@@ -162,6 +169,35 @@ def __init__(
                                         f"data file '{data_file_path}'."
                                         )
 
+        # Validate remote manifest against local schema when jsonschema is available.
+        # If validation fails, fall back to the local manifest so older versions
+        # degrade gracefully when the remote schema evolves.
+        if _HAS_JSONSCHEMA and data_file_path.lower().startswith("http"):
+            schema_path = os.path.join(
+                os.path.dirname(__file__), "resources", "data.schema.json"
+            )
+            try:
+                with open(schema_path, "r", encoding="utf-8") as sf:
+                    schema_data = json.load(sf)
+                validate(instance=site_data, schema=schema_data)
+            except ValidationError:
+                print(
+                    "Warning: Remote manifest failed schema validation, "
+                    "falling back to local data."
+                )
+                local_path = os.path.join(
+                    os.path.dirname(__file__), "resources", "data.json"
+                )
+                with open(local_path, "r", encoding="utf-8") as lf:
+                    site_data = json.load(lf)
+            except Exception as error:
+                # Schema file missing, unreadable or itself invalid: validation is
+                # best-effort, so keep the remote data but report why it was skipped.
+                print(
+                    "Warning: Could not validate remote manifest "
+                    f"({type(error).__name__}); continuing without validation."
+                )
+
         site_data.pop('$schema', None)
 
         if honor_exclusions:
diff --git a/tests/test_manifest.py b/tests/test_manifest.py
index b73e92408f..e480c0e729 100644
--- a/tests/test_manifest.py
+++ b/tests/test_manifest.py
@@ -30,6 +30,37 @@ def test_validate_manifest_against_remote_schema(remote_schema):
     validate(instance=jsondat, schema=remote_schema)
 
 
+def test_schema_validation_fallback(tmp_path):
+    """An invalid local manifest raises ValueError; local files get no fallback."""
+    from sherlock_project.sites import SitesInformation
+
+    # Write an invalid manifest (missing required keys on every entry)
+    bad_manifest = {"BadSite": {"not_a_valid_key": True}}
+    bad_file = tmp_path / "bad_data.json"
+    bad_file.write_text(json.dumps(bad_manifest))
+
+    # Loading the invalid file directly should raise (no fallback for local files)
+    with pytest.raises(ValueError):
+        SitesInformation(data_file_path=str(bad_file))
+
+
+def test_schema_validation_passes_valid_manifest():
+    """Ensure the local manifest validates against the local schema file."""
+    json_relative: str = '../sherlock_project/resources/data.json'
+    schema_relative: str = '../sherlock_project/resources/data.schema.json'
+    json_path: str = os.path.join(os.path.dirname(__file__), json_relative)
+    schema_path: str = os.path.join(os.path.dirname(__file__), schema_relative)
+
+    with open(json_path, 'r', encoding='utf-8') as f:
+        jsondat = json.load(f)
+    with open(schema_path, 'r', encoding='utf-8') as f:
+        schemadat = json.load(f)
+
+    from jsonschema import validate as jvalidate
+    # Should not raise
+    jvalidate(instance=jsondat, schema=schemadat)
+
+
 # Ensure that the expected values are beind returned by the site list
 @pytest.mark.parametrize("target_name,target_expected_err_type", [
     ('GitHub', 'status_code'),