diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index 1c67166d..491e1d6b 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -290,8 +290,12 @@ function launching_scripts { # IMAGES screen -S "Script_AIL" -X screen -t "Exif" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Exif.py; read x" sleep 0.1 + screen -S "Script_AIL" -X screen -t "ImagePhash" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./ImagePhash.py; read x" + sleep 0.1 screen -S "Script_AIL" -X screen -t "OcrExtractor" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./OcrExtractor.py; read x" sleep 0.1 + screen -S "Script_AIL" -X screen -t "PhashCorrelation" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./PhashCorrelation.py; read x" + sleep 0.1 screen -S "Script_AIL" -X screen -t "CodeReader" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./CodeReader.py; read x" sleep 0.1 diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py index 9b5a9ec5..ff93d553 100755 --- a/bin/lib/ail_core.py +++ b/bin/lib/ail_core.py @@ -18,7 +18,7 @@ AIL_OBJECTS = {'author', 'barcode', 'chat', 'chat-subchannel', 'chat-thread', 'cookie-name', 'cve', 'cryptocurrency', 'decoded', 'domain', 'dom-hash', 'etag', 'favicon', 'file-name', 'gtracker', 'hhhash', 'ip', - 'item', 'image', 'mail', 'message', 'ocr', 'pdf', 'pgp', 'qrcode', 'ssh-key', 'screenshot', 'title', + 'item', 'image', 'mail', 'message', 'ocr', 'pdf', 'pgp', 'phash', 'qrcode', 'ssh-key', 'screenshot', 'title', 'user-account', 'username'} AIL_OBJECTS_WITH_SUBTYPES = {'chat', 'chat-subchannel', 'cryptocurrency', 'pgp', 'username', 'user-account'} @@ -26,7 +26,7 @@ # TODO by object TYPE ???? 
correlation AIL_OBJECTS_CORRELATIONS_DEFAULT = {'author', 'barcode', 'chat', 'chat-subchannel', 'chat-thread', 'cve', 'cryptocurrency', 'decoded', 'domain', 'dom-hash', 'favicon', 'file-name', 'gtracker', 'item', - 'image', 'ip', 'mail', 'message', 'ocr', 'pdf', 'pgp', 'qrcode', 'screenshot', + 'image', 'ip', 'mail', 'message', 'ocr', 'pdf', 'pgp', 'phash', 'qrcode', 'screenshot', 'ssh-key', 'title', 'user-account', 'username'} AIL_OBJS_QUEUES = {'barcode', 'decoded', 'file-name', 'image', 'item', 'message', 'ocr', 'pgp', 'qrcode', 'screenshot', 'title'} # ADD TAGS ??? diff --git a/bin/lib/correlations_engine.py b/bin/lib/correlations_engine.py index 1d63192f..7da09586 100755 --- a/bin/lib/correlations_engine.py +++ b/bin/lib/correlations_engine.py @@ -57,7 +57,7 @@ "file-name": ["chat", "item", "message", "pdf"], "gtracker": ["domain", "item"], "hhhash": ["domain"], - "image": ["barcode", "chat", "chat-subchannel", "chat-thread", "message", "ocr", "qrcode", "user-account"], # TODO subchannel + threads ???? + "image": ["barcode", "chat", "chat-subchannel", "chat-thread", "message", "ocr", "phash", "qrcode", "user-account"], # TODO subchannel + threads ???? "ip": ["ssh-key"], "item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "file-name", "gtracker", "mail", "message", "pdf", "pgp", "screenshot", "title", "username"], # chat ??? "mail": ["domain", "item", "message"], # chat ?? @@ -65,6 +65,7 @@ "ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"], "pdf": ["author", "chat", "file-name", "item", "message"], "pgp": ["chat", "domain", "item", "message", "ocr"], + "phash": ["image", "phash"], "qrcode": ["chat", "cve", "cryptocurrency", "decoded", "domain", "image", "message", "screenshot"], # "chat-subchannel", "chat-thread" ????? 
def get_misp_object(self):
    """
    Build and return the MISP 'phash' object for export.

    Copies the first/last seen window onto the MISP object when known,
    logs a warning when either bound is missing, and propagates every
    AIL tag onto the exported attribute.
    """
    misp_obj = MISPObject('phash')

    first_seen = self.get_first_seen()
    last_seen = self.get_last_seen()
    if first_seen:
        misp_obj.first_seen = first_seen
    if last_seen:
        misp_obj.last_seen = last_seen
    if not first_seen or not last_seen:
        # Export still proceeds, but the missing daterange is logged
        self.logger.warning(
            f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={first_seen}, last={last_seen}')

    attributes = [misp_obj.add_attribute('phash', value=self.get_id())]
    for attribute in attributes:
        for tag in self.get_tags():
            attribute.add_tag(tag)
    return misp_obj
def hamming_distance(phash1, phash2):
    """
    Return the Hamming distance between two hex-encoded phash values.

    Args:
        phash1: First phash value (hex string)
        phash2: Second phash value (hex string)

    Returns:
        int: number of differing bits (0-64 for a 16-character hex phash),
             or None when either input is missing, the lengths differ,
             or a value is not valid hexadecimal.
    """
    # Guard clauses: only compare two non-empty, equal-length strings.
    if not phash1 or not phash2:
        return None
    if len(phash1) != len(phash2):
        return None
    try:
        # XOR of the integer forms: every set bit marks a differing bit.
        diff_bits = int(phash1, 16) ^ int(phash2, 16)
    except (ValueError, TypeError):
        return None
    return bin(diff_bits).count('1')
def add_to_bktree_index(phash_value):
    """
    Insert a phash value into the KVRocks-backed BK-tree index.

    Index layout:
        - phash:bktree:root                 -> root node phash
        - phash:bktree:{phash}:children     -> hash map {distance: child phash}

    Args:
        phash_value: Phash value to add to the index
    """
    if not phash_value:
        return

    node = r_objects.get('phash:bktree:root')
    if not node:
        # Empty tree: the first phash becomes the root.
        r_objects.set('phash:bktree:root', phash_value)
        return

    # Walk down the tree along the edge labelled with the Hamming
    # distance until a free slot is found.
    while True:
        dist = hamming_distance(node, phash_value)
        # None -> uncomparable input; 0 -> exact duplicate. Neither is inserted.
        if not dist:
            return

        children_key = f'phash:bktree:{node}:children'
        next_node = r_objects.hget(children_key, str(dist))
        if next_node:
            node = next_node
        else:
            # No child at this distance: attach the new phash here.
            r_objects.hset(children_key, str(dist), phash_value)
            return
def search_bktree_index(query_phash, max_distance):
    """
    Search the BK-tree index for phashes similar to *query_phash*.

    Prunes subtrees with the triangle inequality: a subtree reached
    through an edge labelled d_child can only contain matches when
    |d(node, query) - d_child| <= max_distance.

    Args:
        query_phash: Phash value to search for
        max_distance: Maximum Hamming distance for matches

    Returns:
        list: (phash_value, distance) tuples for all matches
    """
    if not query_phash:
        return []

    root = r_objects.get('phash:bktree:root')
    if not root:
        # Empty tree
        return []

    matches = []
    stack = [root]
    while stack:
        node = stack.pop()

        dist = hamming_distance(node, query_phash)
        if dist is None:
            # Uncomparable node (bad data): skip it and its subtree.
            continue

        if dist <= max_distance:
            matches.append((node, dist))

        # Visit only the children whose edge label could still yield a match.
        children = r_objects.hgetall(f'phash:bktree:{node}:children')
        for edge_label, child_phash in children.items():
            if abs(dist - int(edge_label)) <= max_distance:
                stack.append(child_phash)

    return matches
def rebuild_bktree_index():
    """
    Rebuild the BK-tree index from all existing phash objects.

    This should be called:
        - After importing old data
        - After index corruption
        - During migration

    Returns:
        int: Number of phashes indexed
    """
    phashs = Phashs()

    # Fully clear the previous index before rebuilding. Removing only the
    # root pointer is not enough: stale per-node children maps
    # (phash:bktree:{phash}:children) would survive, and re-inserting
    # phashes would follow those stale links and corrupt the new tree.
    r_objects.delete('phash:bktree:root')
    for phash_id in phashs.get_ids_iterator():
        r_objects.delete(f'phash:bktree:{phash_id}:children')

    # Re-insert every known phash into the fresh tree.
    count = 0
    for phash_id in phashs.get_ids_iterator():
        add_to_bktree_index(phash_id)
        count += 1
    return count
QrCodes.Qrcodes}, 'screenshot': {'obj': Screenshots.Screenshot, 'objs': None}, #################################################################################################### 'ssh-key': {'obj': SSHKeys.SSHKey, 'objs': SSHKeys.SSHKeys}, diff --git a/bin/modules/ImagePhash.py b/bin/modules/ImagePhash.py new file mode 100644 index 00000000..6d8173e2 --- /dev/null +++ b/bin/modules/ImagePhash.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +""" +ImagePhash Module +=================== + +Process images from the Image queue and: +1. Calculate perceptual hash (phash) +2. Store phash in Image metadata +3. Create Phash objects +4. Create Phash ↔ Image correlations + +""" + +import os +import sys + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from modules.abstract_module import AbstractModule +from lib.objects import Images +from lib.objects import Phashs + + +class ImagePhash(AbstractModule): + """ + ImagePhash module: Calculate and store perceptual hashes for images + """ + + def __init__(self): + super(ImagePhash, self).__init__() + + # Send to PhashCorrelation queue + self.obj = None + + def compute(self, message, r_result=False): + """ + Process an image and calculate its phash. 
def compute(self, message, r_result=False):
    """
    Compute and persist the perceptual hash of one image.

    Args:
        message: Image ID from the Image queue
        r_result: when True, return the computed phash (used by tests)

    Returns:
        str | None: the phash value when r_result is set, otherwise None
    """
    image = Images.Image(message)
    if not image.exists():
        return None

    content = image.get_content(r_type='bytes')
    if not content:
        return None

    phash_value = Phashs.calculate_phash(content)
    if not phash_value:
        # imagehash library unavailable or the image could not be decoded
        return None

    # Create (or fetch) the Phash object and link it to its source image
    phash_obj = Phashs.create(phash_value)
    phash_obj.add_correlation('image', '', message)

    # Hand off to PhashCorrelation for similarity detection
    self.send_message_to_queue(phash_value, 'PhashCorrelation')

    if r_result:
        return phash_value
    return None
def compute(self, message, r_result=False):
    """
    Correlate a phash with every similar phash already indexed.

    Args:
        message: Phash value to search for
        r_result: when True, return the list of (phash, distance) matches

    Returns:
        list | None
    """
    phash_obj = Phashs.Phash(message)
    if not phash_obj.exists():
        return None

    # BK-tree search (with linear fallback) for near-duplicate phashes
    similar_phashes = self._find_similar_phashes(message)

    for candidate, _distance in similar_phashes:
        # Skip self-matches; correlation is bidirectional Phash <-> Phash
        if candidate != message:
            phash_obj.add_correlation('phash', '', candidate)

    if r_result:
        return similar_phashes
    return None
def _linear_search_fallback(self, query_phash):
    """
    O(n) scan over every known phash.

    Much slower than the BK-tree, but guarantees results when the
    index is missing or unusable.

    Args:
        query_phash: Phash value to search for

    Returns:
        list: (phash_value, distance) tuples within self.max_distance
    """
    matches = []
    for candidate in Phashs.Phashs().get_ids_iterator():
        distance = Phashs.hamming_distance(query_phash, candidate)
        # Guard clause: drop uncomparable or too-distant candidates.
        if distance is None or distance > self.max_distance:
            continue
        matches.append((candidate, distance))
    return matches
def main():
    """
    Rebuild the BK-tree index from all existing phash objects and
    print a short summary.

    Returns:
        int: process exit code (0 on success)
    """
    print('Rebuilding Phash BK-tree index...')
    total = Phashs.rebuild_bktree_index()
    print('BK-tree index rebuilt successfully.')
    print(f'Total phashes indexed: {total}')
    return 0
TestModuleImagePhash(unittest.TestCase): + + def setUp(self): + # Import here to avoid circular dependency + from modules.ImagePhash import ImagePhash + self.module = ImagePhash() + self.module.debug = True + + def test_module_no_image(self): + """Test module with non-existent image""" + result = self.module.compute('nonexistent_image_id', r_result=True) + self.assertIsNone(result) + + +class TestModulePhashCorrelation(unittest.TestCase): + + def setUp(self): + # Import here to avoid circular dependency + from modules.PhashCorrelation import PhashCorrelation + self.module = PhashCorrelation() + self.module.debug = True + + def test_module_no_phash(self): + """Test module with non-existent phash""" + result = self.module.compute('nonexistent_phash', r_result=True) + self.assertIsNone(result) + + if __name__ == '__main__': unittest.main() diff --git a/tests/test_objects_phashes.py b/tests/test_objects_phashes.py new file mode 100644 index 00000000..cf764794 --- /dev/null +++ b/tests/test_objects_phashes.py @@ -0,0 +1,418 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import sys +import unittest + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.ConfigLoader import ConfigLoader +from lib.objects import Phashs + +# Get test config +config_loader = ConfigLoader() +r_objects = config_loader.get_db_conn("Kvrocks_Objects") + + +class TestPhashObject(unittest.TestCase): + """Test Phash object functionality""" + + def setUp(self): + """Set up test fixtures""" + self.test_phash = "a1b2c3d4e5f67890" + self.phash_obj = Phashs.Phash(self.test_phash) + + def tearDown(self): + """Clean up after tests""" + # Clean up test phash object + if self.phash_obj.exists(): + r_objects.delete(f'meta:phash:{self.test_phash}') + + # Clean up BK-tree index + root = r_objects.get('phash:bktree:root') + if root: + r_objects.delete('phash:bktree:root') + # Clean up all children 
keys for test phashes + for key in r_objects.keys('phash:bktree:*:children'): + r_objects.delete(key) + + def test_phash_creation(self): + """Test creating a Phash object""" + self.assertFalse(self.phash_obj.exists()) + + self.phash_obj.create() + self.assertTrue(self.phash_obj.exists()) + + def test_phash_get_id(self): + """Test getting Phash ID""" + self.assertEqual(self.phash_obj.get_id(), self.test_phash) + + def test_phash_get_type(self): + """Test getting Phash type""" + self.assertEqual(self.phash_obj.get_type(), 'phash') + + def test_phash_get_link(self): + """Test getting Phash link""" + link = self.phash_obj.get_link() + self.assertIn('phash', link) + self.assertIn(self.test_phash, link) + + def test_phash_get_meta(self): + """Test getting Phash metadata""" + self.phash_obj.create() + meta = self.phash_obj.get_meta() + + self.assertIsInstance(meta, dict) + self.assertEqual(meta['id'], self.test_phash) + self.assertIn('tags', meta) + + +class TestHammingDistance(unittest.TestCase): + """Test Hamming distance calculation""" + + def test_hamming_distance_identical(self): + """Test Hamming distance for identical phashes""" + phash1 = "a1b2c3d4e5f67890" + phash2 = "a1b2c3d4e5f67890" + distance = Phashs.hamming_distance(phash1, phash2) + self.assertEqual(distance, 0) + + def test_hamming_distance_different(self): + """Test Hamming distance for different phashes""" + phash1 = "0000000000000000" + phash2 = "ffffffffffffffff" + distance = Phashs.hamming_distance(phash1, phash2) + self.assertEqual(distance, 64) # All 64 bits different + + def test_hamming_distance_one_bit(self): + """Test Hamming distance with one bit different""" + phash1 = "0000000000000000" + phash2 = "0000000000000001" + distance = Phashs.hamming_distance(phash1, phash2) + self.assertEqual(distance, 1) + + def test_hamming_distance_none_input(self): + """Test Hamming distance with None input""" + distance = Phashs.hamming_distance(None, "a1b2c3d4e5f67890") + self.assertIsNone(distance) + + 
distance = Phashs.hamming_distance("a1b2c3d4e5f67890", None) + self.assertIsNone(distance) + + def test_hamming_distance_different_lengths(self): + """Test Hamming distance with different length inputs""" + phash1 = "a1b2c3d4" + phash2 = "a1b2c3d4e5f67890" + distance = Phashs.hamming_distance(phash1, phash2) + self.assertIsNone(distance) + + def test_hamming_distance_invalid_hex(self): + """Test Hamming distance with invalid hex strings""" + phash1 = "invalid_hex" + phash2 = "a1b2c3d4e5f67890" + distance = Phashs.hamming_distance(phash1, phash2) + self.assertIsNone(distance) + + +class TestBKTreeIndexing(unittest.TestCase): + """Test BK-tree indexing functionality""" + + def setUp(self): + """Set up test fixtures""" + # Clean up any existing BK-tree + root = r_objects.get('phash:bktree:root') + if root: + r_objects.delete('phash:bktree:root') + for key in r_objects.keys('phash:bktree:*:children'): + r_objects.delete(key) + + def tearDown(self): + """Clean up after tests""" + # Clean up BK-tree + root = r_objects.get('phash:bktree:root') + if root: + r_objects.delete('phash:bktree:root') + for key in r_objects.keys('phash:bktree:*:children'): + r_objects.delete(key) + + # Clean up test phash objects + for i in range(10): + phash_id = f"000000000000000{i:x}" + r_objects.delete(f'meta:phash:{phash_id}') + + def test_bktree_add_root(self): + """Test adding root to BK-tree""" + phash = "a1b2c3d4e5f67890" + + Phashs.add_to_bktree_index(phash) + + root = r_objects.get('phash:bktree:root') + self.assertEqual(root, phash) + + def test_bktree_add_multiple(self): + """Test adding multiple phashes to BK-tree""" + phashes = [ + "0000000000000000", + "0000000000000001", + "0000000000000003", + "00000000000000ff" + ] + + for phash in phashes: + Phashs.add_to_bktree_index(phash) + + root = r_objects.get('phash:bktree:root') + self.assertEqual(root, phashes[0]) + + # Check that children are added + children_key = f'phash:bktree:{phashes[0]}:children' + children = 
r_objects.hgetall(children_key) + self.assertGreater(len(children), 0) + + def test_bktree_add_duplicate(self): + """Test adding duplicate phash to BK-tree""" + phash = "a1b2c3d4e5f67890" + + Phashs.add_to_bktree_index(phash) + Phashs.add_to_bktree_index(phash) # Add again + + root = r_objects.get('phash:bktree:root') + self.assertEqual(root, phash) + + # Should not create children for duplicate + children_key = f'phash:bktree:{phash}:children' + children = r_objects.hgetall(children_key) + self.assertEqual(len(children), 0) + + def test_bktree_add_none(self): + """Test adding None to BK-tree""" + Phashs.add_to_bktree_index(None) + + root = r_objects.get('phash:bktree:root') + self.assertIsNone(root) + + +class TestBKTreeSearch(unittest.TestCase): + """Test BK-tree search functionality""" + + def setUp(self): + """Set up test fixtures with populated BK-tree""" + # Clean up any existing BK-tree + root = r_objects.get('phash:bktree:root') + if root: + r_objects.delete('phash:bktree:root') + for key in r_objects.keys('phash:bktree:*:children'): + r_objects.delete(key) + + # Build test tree with known phashes + self.test_phashes = [ + "0000000000000000", # Distance 0 from query + "0000000000000001", # Distance 1 from query + "0000000000000003", # Distance 2 from query + "0000000000000007", # Distance 3 from query + "000000000000000f", # Distance 4 from query + "00000000000000ff", # Distance 8 from query + "0000000000000fff", # Distance 12 from query + ] + + for phash in self.test_phashes: + Phashs.add_to_bktree_index(phash) + + def tearDown(self): + """Clean up after tests""" + # Clean up BK-tree + root = r_objects.get('phash:bktree:root') + if root: + r_objects.delete('phash:bktree:root') + for key in r_objects.keys('phash:bktree:*:children'): + r_objects.delete(key) + + # Clean up test phash objects + for phash in self.test_phashes: + r_objects.delete(f'meta:phash:{phash}') + + def test_bktree_search_exact_match(self): + """Test searching for exact match""" + query = 
"0000000000000000" + results = Phashs.search_bktree_index(query, max_distance=0) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0][0], query) + self.assertEqual(results[0][1], 0) + + def test_bktree_search_threshold_1(self): + """Test searching with threshold 1""" + query = "0000000000000000" + results = Phashs.search_bktree_index(query, max_distance=1) + + # Should find phashes with distance 0 and 1 + distances = [r[1] for r in results] + self.assertIn(0, distances) + self.assertIn(1, distances) + self.assertTrue(all(d <= 1 for d in distances)) + + def test_bktree_search_threshold_8(self): + """Test searching with default threshold 8""" + query = "0000000000000000" + results = Phashs.search_bktree_index(query, max_distance=8) + + # Should find multiple phashes + self.assertGreater(len(results), 0) + distances = [r[1] for r in results] + self.assertTrue(all(d <= 8 for d in distances)) + + def test_bktree_search_empty_tree(self): + """Test searching in empty tree""" + # Clean up tree + r_objects.delete('phash:bktree:root') + + query = "0000000000000000" + results = Phashs.search_bktree_index(query, max_distance=8) + + self.assertEqual(len(results), 0) + + def test_bktree_search_none_query(self): + """Test searching with None query""" + results = Phashs.search_bktree_index(None, max_distance=8) + self.assertEqual(len(results), 0) + + def test_bktree_search_large_threshold(self): + """Test searching with large threshold""" + query = "0000000000000000" + results = Phashs.search_bktree_index(query, max_distance=64) + + # Should find all phashes in tree + self.assertGreater(len(results), 0) + + +class TestRebuildBKTreeIndex(unittest.TestCase): + """Test BK-tree index rebuilding""" + + def setUp(self): + """Set up test fixtures""" + # Clean up any existing BK-tree + root = r_objects.get('phash:bktree:root') + if root: + r_objects.delete('phash:bktree:root') + for key in r_objects.keys('phash:bktree:*:children'): + r_objects.delete(key) + + # Create some 
test phash objects without adding to tree + self.test_phashes = [ + "1111111111111111", + "2222222222222222", + "3333333333333333" + ] + + for phash in self.test_phashes: + obj = Phashs.Phash(phash) + obj.create() + + def tearDown(self): + """Clean up after tests""" + # Clean up BK-tree + root = r_objects.get('phash:bktree:root') + if root: + r_objects.delete('phash:bktree:root') + for key in r_objects.keys('phash:bktree:*:children'): + r_objects.delete(key) + + # Clean up test phash objects + for phash in self.test_phashes: + r_objects.delete(f'meta:phash:{phash}') + + def test_rebuild_bktree_index(self): + """Test rebuilding BK-tree index""" + # Rebuild index + count = Phashs.rebuild_bktree_index() + + # Should have indexed all test phashes + self.assertGreaterEqual(count, len(self.test_phashes)) + + # Root should be set + root = r_objects.get('phash:bktree:root') + self.assertIsNotNone(root) + + +class TestCalculatePhash(unittest.TestCase): + """Test perceptual hash calculation""" + + def test_calculate_phash_none_input(self): + """Test phash calculation with None input""" + result = Phashs.calculate_phash(None) + # Should return None if calculation fails + self.assertIsNone(result) + + def test_calculate_phash_invalid_input(self): + """Test phash calculation with invalid input""" + result = Phashs.calculate_phash(b"invalid image data") + # Should return None if calculation fails + self.assertIsNone(result) + + +class TestCreatePhash(unittest.TestCase): + """Test Phash object creation with index""" + + def setUp(self): + """Set up test fixtures""" + self.test_phash = "9999999999999999" + + def tearDown(self): + """Clean up after tests""" + # Clean up test phash object + r_objects.delete(f'meta:phash:{self.test_phash}') + + # Clean up BK-tree + root = r_objects.get('phash:bktree:root') + if root: + r_objects.delete('phash:bktree:root') + for key in r_objects.keys('phash:bktree:*:children'): + r_objects.delete(key) + + def test_create_phash(self): + """Test 
creating phash with automatic indexing""" + obj = Phashs.create(self.test_phash) + + self.assertIsNotNone(obj) + self.assertTrue(obj.exists()) + + # Check that it was added to BK-tree + root = r_objects.get('phash:bktree:root') + self.assertIsNotNone(root) + + def test_create_phash_duplicate(self): + """Test creating duplicate phash""" + obj1 = Phashs.create(self.test_phash) + obj2 = Phashs.create(self.test_phash) + + # Both should return same object + self.assertEqual(obj1.get_id(), obj2.get_id()) + + +class TestPhashs(unittest.TestCase): + """Test Phashs collection class""" + + def test_phashs_get_name(self): + """Test getting Phashs collection name""" + phashs = Phashs.Phashs() + self.assertEqual(phashs.get_name(), 'Phashs') + + def test_phashs_get_icon(self): + """Test getting Phashs icon""" + phashs = Phashs.Phashs() + icon = phashs.get_icon() + self.assertIsInstance(icon, dict) + self.assertIn('icon', icon) + + def test_phashs_get_link(self): + """Test getting Phashs link""" + phashs = Phashs.Phashs() + link = phashs.get_link() + self.assertIn('phash', link) + + +if __name__ == '__main__': + unittest.main() diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 77aae11d..fd2a5d51 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -59,6 +59,7 @@ from blueprints.objects_etag import objects_etag from blueprints.objects_hhhash import objects_hhhash from blueprints.objects_dom_hash import objects_dom_hash +from blueprints.objects_phash import objects_phash from blueprints.chats_explorer import chats_explorer from blueprints.objects_image import objects_image from blueprints.objects_ocr import objects_ocr @@ -159,6 +160,7 @@ def filter(self, record): app.register_blueprint(objects_etag, url_prefix=baseUrl) app.register_blueprint(objects_hhhash, url_prefix=baseUrl) app.register_blueprint(objects_dom_hash, url_prefix=baseUrl) +app.register_blueprint(objects_phash, url_prefix=baseUrl) app.register_blueprint(chats_explorer, 
url_prefix=baseUrl) app.register_blueprint(objects_image, url_prefix=baseUrl) app.register_blueprint(objects_ocr, url_prefix=baseUrl) diff --git a/var/www/blueprints/objects_phash.py b/var/www/blueprints/objects_phash.py new file mode 100644 index 00000000..074d35d5 --- /dev/null +++ b/var/www/blueprints/objects_phash.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +''' + Blueprint Flask: Phash objects endpoints +''' + +import os +import sys + +from flask import render_template, jsonify, request, Blueprint, redirect, url_for, Response, abort, send_file +from flask_login import login_required + +# Import Role_Manager +from Role_Manager import login_admin, login_read_only + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.objects import Phashs +from packages import Date + +# ============ BLUEPRINT ============ +objects_phash = Blueprint('objects_phash', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/phash')) + +# ============ VARIABLES ============ +bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] + + +# ============ FUNCTIONS ============ +@objects_phash.route("/objects/phashes", methods=['GET']) +@login_required +@login_read_only +def objects_phashs(): + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + show_objects = request.args.get('show_objects') + date = Date.sanitise_date_range(date_from, date_to) + date_from = date['date_from'] + date_to = date['date_to'] + + if show_objects: + dict_objects = Phashs.Phashs().api_get_meta_by_daterange(date_from, date_to) + else: + dict_objects = {} + + return render_template("PhashDaterange.html", date_from=date_from, date_to=date_to, + dict_objects=dict_objects, show_objects=show_objects) + +@objects_phash.route("/objects/phash/post", methods=['POST']) +@login_required +@login_read_only +def objects_phashs_post(): + 
date_from = request.form.get('date_from') + date_to = request.form.get('date_to') + show_objects = request.form.get('show_objects') + return redirect(url_for('objects_phash.objects_phashs', date_from=date_from, date_to=date_to, show_objects=show_objects)) + +@objects_phash.route("/objects/phash/range/json", methods=['GET']) +@login_required +@login_read_only +def objects_phash_range_json(): + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + date = Date.sanitise_date_range(date_from, date_to) + date_from = date['date_from'] + date_to = date['date_to'] + return jsonify(Phashs.Phashs().api_get_chart_nb_by_daterange(date_from, date_to)) + +# ============= ROUTES ============== diff --git a/var/www/templates/objects/phash/PhashDaterange.html b/var/www/templates/objects/phash/PhashDaterange.html new file mode 100644 index 00000000..3e03d9e3 --- /dev/null +++ b/var/www/templates/objects/phash/PhashDaterange.html @@ -0,0 +1,611 @@ + + + + + Phashs - AIL + + + + + + + + + + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'sidebars/sidebar_objects.html' %} + +
+ +
+
+
+ +{#
#} +{#
#} +{#
Search Phash by name:
#} +{#
#} +{#
#} +{# #} +{# #} +{#
#} +{#
#} +{#
#} +{#
#} +
+ + +
+ +
+
+
Select a date range :
+
+
+
+ +
+
+
+ +
+
+ + +
+ +
+
+
+ +
+
+
+
+
+
+ + {% if dict_objects %} + {% if date_from|string == date_to|string %} +

{{ date_from }} Phash:

+ {% else %} +

{{ date_from }} to {{ date_to }} Phash:

+ {% endif %} + + + + + + + + + + + + {% for phash_id in dict_objects %} + + + + + + + + {% endfor %} + +
Phash-IDFirst SeenLast SeenTotalLast days
{{ phash_id }}{{ dict_objects[phash_id]['first_seen'] }}{{ dict_objects[phash_id]['last_seen'] }}{{ dict_objects[phash_id]['nb_seen'] }}
+ + + {% else %} + {% if show_objects %} + {% if date_from|string == date_to|string %} +

{{ date_from }}, No Phash

+ {% else %} +

{{ date_from }} to {{ date_to }}, No Phash

+ {% endif %} + {% endif %} + {% endif %} +
+ +
+
+ + + + + + + + + + + + + + + + +