Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions sherlock_project/sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,16 @@
This is the raw data that will be used to search for usernames.
"""
import json
import os
import requests
import secrets

try:
from jsonschema import validate, ValidationError
_HAS_JSONSCHEMA = True
except ImportError:
_HAS_JSONSCHEMA = False


# Raw GitHub URL of the upstream data.json site manifest (master branch).
MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
# Raw GitHub URL of the false-positive exclusions list (kept on the "exclusions" branch).
EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
Expand Down Expand Up @@ -162,6 +169,31 @@ def __init__(
f"data file '{data_file_path}'."
)

# Validate remote manifest against local schema when jsonschema is available.
# If validation fails, fall back to the local manifest so older versions
# degrade gracefully when the remote schema evolves.
if _HAS_JSONSCHEMA and data_file_path.lower().startswith("http"):
schema_path = os.path.join(
os.path.dirname(__file__), "resources", "data.schema.json"
)
try:
with open(schema_path, "r", encoding="utf-8") as sf:
schema_data = json.load(sf)
validate(instance=site_data, schema=schema_data)
except ValidationError:
print(
"Warning: Remote manifest failed schema validation, "
"falling back to local data."
)
local_path = os.path.join(
os.path.dirname(__file__), "resources", "data.json"
)
with open(local_path, "r", encoding="utf-8") as lf:
site_data = json.load(lf)
except Exception:
# Schema file missing or unreadable -- continue without validation
pass

site_data.pop('$schema', None)

if honor_exclusions:
Expand Down
31 changes: 31 additions & 0 deletions tests/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,37 @@ def test_validate_manifest_against_remote_schema(remote_schema):

validate(instance=jsondat, schema=remote_schema)

def test_schema_validation_fallback(tmp_path, capsys):
    """A malformed *local* manifest raises instead of falling back.

    The schema-validation fallback in ``SitesInformation`` is only attempted
    for manifests whose path starts with ``http``.  A local file whose
    entries lack the required keys must therefore surface as ``ValueError``,
    and the fallback warning must not be printed.
    """
    from sherlock_project.sites import SitesInformation

    # Write an invalid manifest (missing required keys on every entry)
    bad_manifest = {"BadSite": {"not_a_valid_key": True}}
    bad_file = tmp_path / "bad_data.json"
    # Explicit encoding keeps the fixture independent of the platform default.
    bad_file.write_text(json.dumps(bad_manifest), encoding="utf-8")

    # Loading the invalid file directly should raise (no fallback for local files)
    with pytest.raises(ValueError):
        SitesInformation(data_file_path=str(bad_file))

    # Confirm the remote-only fallback path was not taken for a local file.
    captured = capsys.readouterr()
    assert "falling back to local data" not in captured.out


def test_schema_validation_passes_valid_manifest():
    """Ensure the bundled manifest validates against the bundled schema.

    Loads ``data.json`` and ``data.schema.json`` from the package's
    ``resources`` directory and runs ``jsonschema.validate`` on them; the
    test fails if validation raises.
    """
    from jsonschema import validate as jvalidate

    json_relative: str = '../sherlock_project/resources/data.json'
    schema_relative: str = '../sherlock_project/resources/data.schema.json'
    json_path: str = os.path.join(os.path.dirname(__file__), json_relative)
    schema_path: str = os.path.join(os.path.dirname(__file__), schema_relative)

    # Read as UTF-8 explicitly (as sites.py does) so the result does not
    # depend on the platform's default text encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        jsondat = json.load(f)
    with open(schema_path, 'r', encoding='utf-8') as f:
        schemadat = json.load(f)

    # Should not raise
    jvalidate(instance=jsondat, schema=schemadat)


# Ensure that the expected values are being returned by the site list
@pytest.mark.parametrize("target_name,target_expected_err_type", [
('GitHub', 'status_code'),
Expand Down