diff --git a/.github/workflows/ail_framework_test.yml b/.github/workflows/ail_framework_test.yml index 22b9220ba..b8185ceb7 100644 --- a/.github/workflows/ail_framework_test.yml +++ b/.github/workflows/ail_framework_test.yml @@ -17,9 +17,27 @@ jobs: # The type of runner that the job will run on runs-on: ubuntu-latest - strategy: - matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] + # TODO: Matrix strategy for Python versions is defined but never used. + # Currently all jobs use the same system Python, making this redundant. + # Either add 'actions/setup-python' to use matrix.python-version, or remove the matrix. + # + # To enable multi-version Python testing: + # + # Step 1: Uncomment the matrix below (defines the Python versions to test): + # strategy: + # matrix: + # python-version: ['3.7', '3.8', '3.9', '3.10'] + # + # Step 2: Add this step after checkout (before "Free up disk space"): + # - name: Set up Python ${{ matrix.python-version }} + # uses: actions/setup-python@v4 + # with: + # python-version: ${{ matrix.python-version }} + # + # ORIGINAL (commented out - not used, makes tests 4x slower with no benefit): + # strategy: + # matrix: + # python-version: ['3.7', '3.8', '3.9', '3.10'] # Steps represent a sequence of tasks that will be executed as part of the job @@ -30,6 +48,26 @@ jobs: submodules: 'recursive' fetch-depth: 500 + # --------------------------------------------- + # NEW STEP: clean up disk BEFORE installing deps + # --------------------------------------------- + - name: Free up disk space + run: | + echo "Disk usage BEFORE cleanup:" + df -h + # Safe: Clear APT cache and lists (can be regenerated) + sudo apt-get clean + sudo rm -rf /var/lib/apt/lists/* + # Probably safe: Remove tools AIL doesn't need (check if exist first) + [ -d /usr/share/dotnet ] && sudo rm -rf /usr/share/dotnet || true + [ -d /opt/ghc ] && sudo rm -rf /opt/ghc || true + [ -d /usr/local/lib/android ] && sudo rm -rf /usr/local/lib/android || true + # Risky but needed: 
Remove hosted tool cache (contains Python, Node, etc.) + # AIL workflow uses system Python, so this should be safe + [ -d /opt/hostedtoolcache ] && sudo rm -rf /opt/hostedtoolcache || true + echo "Disk usage AFTER cleanup:" + df -h + # --------------------------------------------- # Runs a single command using the runners shell - name: Install AIL diff --git a/.gitignore b/.gitignore index f9bdc4ff0..e41c73ea1 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ PASTES CRAWLED_SCREENSHOT IMAGES FAVICONS +FILES BASE64 HASHS DATA_ARDB diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index f0b25673f..55f8c0116 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -206,6 +206,8 @@ function launching_scripts { sleep 0.1 screen -S "Script_AIL" -X screen -t "D4_client" bash -c "cd ${AIL_BIN}/core; ${ENV_PY} ./D4_client.py; read x" sleep 0.1 + screen -S "Script_AIL" -X screen -t "Translation" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Translation.py; read x" + sleep 0.1 screen -S "Script_AIL" -X screen -t "UpdateBackground" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./update-background.py; read x" sleep 0.1 @@ -619,7 +621,7 @@ function launch_tests() { echo -e $GREEN"\t* Flask: $isflasked"$DEFAULT echo -e "" echo -e "" - python3 -m nose2 --start-dir $tests_dir --coverage $bin_dir --with-coverage test_api test_modules + python3 -m nose2 --start-dir $tests_dir --coverage $bin_dir --with-coverage test_api test_modules test_api_crawler exit $? } diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py index a2a0e263e..4ac24fd76 100755 --- a/bin/crawlers/Crawler.py +++ b/bin/crawlers/Crawler.py @@ -281,6 +281,7 @@ def enqueue_capture(self, task_uuid, priority): user_agent=task.get_user_agent(), proxy=task.get_proxy(), cookies=task.get_cookies(), + storage=task.get_local_storage(), with_favicon=True, force=force, general_timeout_in_sec=90) # TODO increase timeout if onion ???? 
diff --git a/bin/exporter/MailExporter.py b/bin/exporter/MailExporter.py index 00488d8da..4c8ff926e 100755 --- a/bin/exporter/MailExporter.py +++ b/bin/exporter/MailExporter.py @@ -143,7 +143,7 @@ def export(self, tracker, obj, matches=[]): body += f'\nMatch {nb}: {match[0]}\nExtract:\n{match[1]}\n\n' nb += 1 - ail_link = f'AIL url:{obj.get_link()}\n\n' + ail_link = f'AIL url: {obj.get_link()}\n\n' for mail in tracker.get_mails(): if ail_users.exists_user(mail): body = ail_link + body diff --git a/bin/importer/feeders/Default.py b/bin/importer/feeders/Default.py index 71ec07e42..eafb8ae83 100755 --- a/bin/importer/feeders/Default.py +++ b/bin/importer/feeders/Default.py @@ -61,6 +61,9 @@ def get_json_meta(self): def get_meta(self): return self.json_data.get('meta') + def get_meta_field(self, field, default=None): + return self.json_data.get('meta', {}).get(field, default) + def get_payload(self): return self.json_data.get('data') @@ -77,7 +80,7 @@ def get_gzip64_content(self): return self.json_data.get('data') def get_obj_type(self): - meta = self.get_json_meta() + meta = self.get_meta() return meta.get('type', 'item') ## OVERWRITE ME ## diff --git a/bin/importer/feeders/abstract_chats_feeder.py b/bin/importer/feeders/abstract_chats_feeder.py index 90409c002..69a9942d1 100755 --- a/bin/importer/feeders/abstract_chats_feeder.py +++ b/bin/importer/feeders/abstract_chats_feeder.py @@ -12,6 +12,8 @@ import sys import time +import pymupdf4llm + from abc import ABC sys.path.append(os.environ['AIL_BIN']) @@ -20,12 +22,14 @@ ################################## from importer.feeders.Default import DefaultFeeder from lib.ail_core import get_chat_instance_name +from lib.objects import Authors from lib.objects.Chats import Chat from lib.objects import ChatSubChannels from lib.objects import ChatThreads from lib.objects import Images from lib.objects import Items from lib.objects import Messages +from lib.objects import PDFs from lib.objects import FilesNames # from lib.objects 
import Files from lib.objects import UsersAccount @@ -168,6 +172,8 @@ def get_obj(self): instance_name = get_chat_instance_name(self.get_chat_instance_uuid()) item_id = f'{instance_name}/{d[0:4]}/{d[4:6]}/{d[6:8]}/{self.json_data["data-sha256"]}.gz' self.obj = Items.Item(item_id) + elif obj_type == 'pdf': + self.obj = PDFs.PDF(self.json_data['data-sha256']) else: obj_id = Messages.create_obj_id(self.get_chat_instance_uuid(), chat_id, message_id, timestamp, thread_id=thread_id) self.obj = Messages.Message(obj_id) @@ -191,10 +197,11 @@ def _process_chat(self, meta_chat, date, new_objs=None): #TODO NONE DATE??? if meta_chat.get('icon'): img = Images.create(meta_chat['icon'], b64=True) - img.add(date, chat) - chat.set_icon(img.get_global_id()) - if new_objs: - new_objs.add(img) + if img: + img.add(date, chat) + chat.set_icon(img.get_global_id()) + if new_objs: + new_objs.add(img) if meta_chat.get('username'): username = Username(meta_chat['username'], self.get_chat_protocol()) @@ -225,9 +232,10 @@ def process_chat(self, new_objs, obj, date, timestamp, feeder_timestamp, reply_i if meta.get('icon'): img = Images.create(meta['icon'], b64=True) - img.add(date, chat) - chat.set_icon(img.get_global_id()) - new_objs.add(img) + if img: + img.add(date, chat) + chat.set_icon(img.get_global_id()) + new_objs.add(img) if meta.get('username'): username = Username(meta['username'], self.get_chat_protocol()) @@ -324,9 +332,10 @@ def _process_user(self, meta, date, timestamp, new_objs=None): if meta.get('icon'): img = Images.create(meta['icon'], b64=True) - img.add(date, user_account) - user_account.set_icon(img.get_global_id()) - new_objs.add(img) + if img: + img.add(date, user_account) + user_account.set_icon(img.get_global_id()) + new_objs.add(img) if meta.get('info'): user_account.set_info(meta['info']) @@ -363,9 +372,10 @@ def process_sender(self, new_objs, obj, date, timestamp): if meta.get('icon'): img = Images.create(meta['icon'], b64=True) - img.add(date, user_account) - 
user_account.set_icon(img.get_global_id()) - new_objs.add(img) + if img: + img.add(date, user_account) + user_account.set_icon(img.get_global_id()) + new_objs.add(img) if meta.get('info'): user_account.set_info(meta['info']) @@ -405,7 +415,8 @@ def process_meta(self): # TODO CHECK MANDATORY FIELDS media_name = self.get_media_name() if media_name: print(media_name) - FilesNames.FilesNames().create(media_name, date, obj) + f = FilesNames.FilesNames().create(media_name, date, obj) + objs.add(f) for reaction in self.get_reactions(): obj.add_reaction(reaction['reaction'], int(reaction['count'])) @@ -430,13 +441,50 @@ def process_meta(self): # TODO CHECK MANDATORY FIELDS if self.obj.type == 'image': obj = Images.create(self.get_message_content()) - obj.add(date, message) - obj.set_parent(obj_global_id=message.get_global_id()) + if obj: + obj.add(date, message) + obj.set_parent(obj_global_id=message.get_global_id()) + + # FILENAME + media_name = self.get_media_name() + if media_name: + f = FilesNames.FilesNames().create(media_name, date, message, file_obj=obj) + objs.add(f) + + elif self.obj.type == 'pdf': + # content + if not self.obj.exists(): + obj = PDFs.create(self.obj.id, self.get_message_content()) + if not obj: + raise Exception('PDF not created, Size limit reached') + obj.set_parent(obj_global_id=message.get_global_id()) + + pdf_meta = self.get_meta_field('file_metadata') + if pdf_meta: + obj.set_file_meta(pdf_meta) + print(pdf_meta) + if 'Author' in pdf_meta: + print(pdf_meta['Author']) + author = Authors.create(pdf_meta['Author'], obj) + author.add(date, obj) + + md_content = pymupdf4llm.to_markdown(obj.get_filepath()) + item_id = f'pdf/{date[0:4]}/{date[4:6]}/{date[6:8]}/{obj.id}.gz' + item = Items.Item(item_id) + if not item.exists(): + item.create(md_content, content_type='str') + objs.add(item) + print(item_id) + obj.add_children('item', '', item_id) + obj.add_correlation('item', '', item_id) + + self.obj.add(date, message) # FILENAME media_name = 
self.get_media_name() if media_name: - FilesNames.FilesNames().create(media_name, date, message, file_obj=obj) + f = FilesNames.FilesNames().create(media_name, date, message, file_obj=self.obj) + objs.add(f) elif self.obj.type == 'item': obj = self.obj @@ -447,8 +495,9 @@ def process_meta(self): # TODO CHECK MANDATORY FIELDS # FILENAME media_name = self.get_media_name() if media_name: - file_name = FilesNames.FilesNames().create(media_name, date, message, file_obj=obj) + f = file_name = FilesNames.FilesNames().create(media_name, date, message, file_obj=obj) file_name.add_correlation('item', '', obj.id) + objs.add(f) for obj in objs: # TODO PERF avoid parsing metas multiple times diff --git a/bin/lib/Language.py b/bin/lib/Language.py index f59cd3a5d..272f31ff1 100755 --- a/bin/lib/Language.py +++ b/bin/lib/Language.py @@ -5,6 +5,7 @@ import re import logging.config import sys +import time import html2text import gcld3 @@ -17,7 +18,7 @@ ################################## from lib import ail_logger from lib.ConfigLoader import ConfigLoader -from lib.ail_core import get_object_all_subtypes +from lib.ail_core import get_object_all_subtypes, generate_uuid logging.config.dictConfig(ail_logger.get_config(name='ail')) logger = logging.getLogger() @@ -320,6 +321,10 @@ def get_iso_from_languages(l_languages, sort=False): l_iso = sorted(l_iso) return l_iso +def exists_lang_iso_target_source(source, target): + if source not in dict_iso_languages or target not in dict_iso_languages: + return False + return True def get_translator_instance(): return TRANSLATOR_URL @@ -566,8 +571,24 @@ def _get_obj_translation(obj_global_id, language, source=None, content=None, fie def get_obj_translation(obj_global_id, language, source=None, content=None, field='', objs_containers=set()): return _get_obj_translation(obj_global_id, language, source=source, content=content, field=field, objs_containers=objs_containers) +def get_obj_translated_languages(obj_gid): + return 
r_lang.hkeys(f'tr:{obj_gid}:') + +def get_obj_translated(obj_gid, language_name=False): + translation = r_lang.hgetall(f'tr:{obj_gid}:') + if not language_name: + return translation + else: + translated = {} + for lang_code in translation: + translated[get_language_from_iso(lang_code)] = translation[lang_code] + return translated + +def exists_object_translation_language(obj_gid, target): + return r_lang.hexists(f'tr:{obj_gid}:', target) -# TODO Force to edit ???? +def get_object_translation_language(obj_gid, target): + return r_lang.hget(f'tr:{obj_gid}:', target) def set_obj_translation(obj_global_id, language, translation, field=''): r_cache.delete(f'translation:{language}:{obj_global_id}:') @@ -699,7 +720,7 @@ def detect(self, content): # print('##############################################################') return language[0] - def translate(self, content, source=None, target="eng"): + def translate(self, content, source=None, target="eng", filter_same_content=True): # print(source, target) l_languages = get_translation_languages() if source: @@ -728,10 +749,13 @@ def translate(self, content, source=None, target="eng"): try: # print(source_iso1, target_iso1) translation = self.lt.translate(content, source_iso1, target_iso1) + # Fix libretranslate dot panic + if translation.endswith('........'): + translation = translation.replace('........', '.') except Exception as e: logger.error(f'Libretranslate Translation: {e}') translation = None - if translation == content: + if translation == content and filter_same_content: # print('EQUAL') translation = None return source, translation @@ -761,6 +785,159 @@ def get_translation_languages(): def ping_libretranslate(): return LanguageTranslator().ping() +def translate(content, source, target="eng", filter_same_content=False): + return LanguageTranslator().translate(content, source=source, target=target, filter_same_content=filter_same_content) + +## Translation Task ## + +def get_translation_tasks(): + return 
r_lang.smembers('tasks:translation') + +def is_translation_task_running(task_uuid): + start = r_lang.hget(f'task:tr:{task_uuid}', 'start') + if start: + start = int(start) + if start + 3600 < int(time.time()): + return False + else: + return True + else: + return False + +def _get_translation_task_to_launch(i_task_uuid): + task_uuid = None + for task_uuid in r_lang.smembers('tasks:translation'): + if task_uuid != i_task_uuid: + if not is_translation_task_running(task_uuid): + return task_uuid + return task_uuid + +def get_translation_task_to_launch(): + task_uuid = r_lang.srandmember('tasks:translation') + if task_uuid: + # srandmember without a count returns a single member, not a list + if not is_translation_task_running(task_uuid): + return task_uuid + else: + return _get_translation_task_to_launch(task_uuid) + else: + return None + +class TranslationTask: + def __init__(self, task_uuid): + self.uuid = task_uuid + + def exists(self): + return r_lang.exists(f'task:tr:{self.uuid}') + + def _get_field(self, field): + return r_lang.hget(f'task:tr:{self.uuid}', field) + + def _set_field(self, field, value): + r_lang.hset(f'task:tr:{self.uuid}', field, value) + + def get_source(self): + return self._get_field('source') + + def get_target(self): + return self._get_field('target') + + def get_progress(self): + return self._get_field('progress') + + def update_time(self): + return self._set_field('time', int(time.time())) + + def update_progress(self, done, total): + if done < 0: + done = 1 + progress = int(done * 100 / total) + if progress == 100: + progress = 99 + self._set_field('progress', progress) + self.update_time() + + def get_object(self): + return self._get_field('object') + + def create(self, obj_gid, source, target): + r_lang.sadd('tasks:translation', self.uuid) + r_lang.sadd(f'tasks:translation:obj:{obj_gid}', self.uuid) + self._set_field('object', obj_gid) + self._set_field('source', source) + self._set_field('target', target) + self._set_field('progress', 0) + + def start(self): 
self._set_field('progress', 0) + self._set_field('start', int(time.time())) + + # set as filename for pdf + def complete(self, translation): + set_obj_translation(self.get_object(), self.get_target(), translation) + self.delete() + + def delete(self): + r_lang.srem('tasks:translation', self.uuid) + r_lang.srem(f'tasks:translation:obj:{self.get_object()}', self.uuid) + r_lang.delete(f'task:tr:{self.uuid}') + +def exists_task(obj_gid, source, target): + task_uuid = False + for t_uuid in get_object_tasks_uuid(obj_gid): + task = TranslationTask(t_uuid) + if task.get_source() == source and task.get_target() == target: + task_uuid = task.uuid + break + return task_uuid + +def create_translation_task(obj_gid, source, target, force=False): + task_uuid = exists_task(obj_gid, source, target) + if task_uuid: + if force: + task = TranslationTask(task_uuid) + task.delete() + else: + return task_uuid + task = TranslationTask(generate_uuid()) + task.create(obj_gid, source, target) + return task.uuid + +def get_object_tasks_uuid(obj_gid): + return r_lang.smembers(f'tasks:translation:obj:{obj_gid}') + +def get_object_tasks(obj_gid, language_name=False): + tasks = {} + for task_uuid in get_object_tasks_uuid(obj_gid): + task = TranslationTask(task_uuid) + target = task.get_target() + if language_name: + target = get_language_from_iso(target) + tasks[task_uuid] = {'progress': task.get_progress(), 'target': target} + return tasks + +def api_get_translation_task_progress(task_uuid): + task = TranslationTask(task_uuid) + if not task.exists(): + return {'error': 'Unknown translation task'}, 404 + return task.get_progress(), 200 + +def api_get_object_translation_tasks_progress(tasks_uuid): + tasks = {} + for task_uuid in tasks_uuid: + task = TranslationTask(task_uuid) + if not task.exists(): + return {'error': 'Unknown translation task'}, 404 + tasks[task_uuid] = task.get_progress() + return tasks, 200 + + +def api_delete_translation_task(task_uuid): + task = 
TranslationTask(task_uuid) + if not task.exists(): + return {'error': 'Unknown translation task'}, 404 + return task.delete(), 200 + if __name__ == '__main__': # t_content = '' diff --git a/bin/lib/Tracker.py b/bin/lib/Tracker.py index 450a7822e..a095b3ded 100755 --- a/bin/lib/Tracker.py +++ b/bin/lib/Tracker.py @@ -731,6 +731,9 @@ def _re_create_tracker(tracker_type, tracker_uuid, to_track, org, user_id, level create_tracker(tracker_type, to_track, org, user_id, level, description=description, filters=filters, tags=tags, mails=mails, webhook=webhook, tracker_uuid=tracker_uuid) +def is_tracker(tracker_uuid): + return Tracker(tracker_uuid).exists() + def get_trackers_types(): return ['word', 'set', 'regex', 'typosquatting', 'yara'] diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py index 5d6d978e5..fcd8c8d7f 100755 --- a/bin/lib/ail_core.py +++ b/bin/lib/ail_core.py @@ -16,22 +16,22 @@ r_object = config_loader.get_db_conn("Kvrocks_Objects") config_loader = None -AIL_OBJECTS = {'barcode', 'chat', 'chat-subchannel', 'chat-thread', 'cookie-name', 'cve', 'cryptocurrency', +AIL_OBJECTS = {'author', 'barcode', 'chat', 'chat-subchannel', 'chat-thread', 'cookie-name', 'cve', 'cryptocurrency', 'decoded', 'domain', 'dom-hash', 'etag', 'favicon', 'file-name', 'gtracker', 'hhhash', 'ip', - 'item', 'image', 'mail', 'message', 'ocr', 'pgp', 'qrcode', 'ssh-key', 'screenshot', 'title', + 'item', 'image', 'mail', 'message', 'ocr', 'pdf', 'pgp', 'qrcode', 'ssh-key', 'screenshot', 'title', 'user-account', 'username'} AIL_OBJECTS_WITH_SUBTYPES = {'chat', 'chat-subchannel', 'cryptocurrency', 'pgp', 'username', 'user-account'} # TODO by object TYPE ???? 
correlation -AIL_OBJECTS_CORRELATIONS_DEFAULT = {'barcode', 'chat', 'chat-subchannel', 'chat-thread', 'cve', 'cryptocurrency', +AIL_OBJECTS_CORRELATIONS_DEFAULT = {'author', 'barcode', 'chat', 'chat-subchannel', 'chat-thread', 'cve', 'cryptocurrency', 'decoded', 'domain', 'dom-hash', 'favicon', 'file-name', 'gtracker', 'item', - 'image', 'ip', 'mail', 'message', 'ocr', 'pgp', 'qrcode', 'screenshot', + 'image', 'ip', 'mail', 'message', 'ocr', 'pdf', 'pgp', 'qrcode', 'screenshot', 'ssh-key', 'title', 'user-account', 'username'} -AIL_OBJS_QUEUES = {'barcode', 'decoded', 'image', 'item', 'message', 'ocr', 'pgp', 'qrcode', 'screenshot', 'title'} # ADD TAGS ??? +AIL_OBJS_QUEUES = {'barcode', 'decoded', 'file-name', 'image', 'item', 'message', 'ocr', 'pgp', 'qrcode', 'screenshot', 'title'} # ADD TAGS ??? -AIL_OBJS_TRACKED = {'barcode', 'decoded', 'item', 'message', 'ocr', 'pgp', 'qrcode', 'title'} +AIL_OBJS_TRACKED = {'barcode', 'decoded', 'file-name', 'item', 'message', 'ocr', 'pgp', 'qrcode', 'title'} AIL_OBJS_RETRO_HUNTED = {'decoded', 'item', 'message', 'ocr'} # TODO PGP, TITLE @@ -115,6 +115,25 @@ def get_obj_queued(): def get_objects_tracked(): return AIL_OBJS_TRACKED # TODO add new test to check if == sorted() return True +def get_nb_objects_tracked(): + return len(AIL_OBJS_TRACKED) + +def is_tracked_object(obj_type): + return obj_type in AIL_OBJS_TRACKED + +def is_tracked_objects(obj_types): + for obj_type in obj_types: + if not is_tracked_object(obj_type): + return False + return True + +def sanitize_tracked_objects(objs): + l_types = [] + for obj in objs: + if is_tracked_object(obj): + l_types.append(obj) + return l_types + def get_objects_retro_hunted(): return AIL_OBJS_RETRO_HUNTED diff --git a/bin/lib/ail_stats.py b/bin/lib/ail_stats.py index 2c91ff216..765285b3f 100755 --- a/bin/lib/ail_stats.py +++ b/bin/lib/ail_stats.py @@ -45,6 +45,9 @@ def get_feeders(): return r_stats.smembers(f'feeders:name') +def reset_feeders_names(): + 
r_stats.delete(f'feeders:name') + def get_current_feeder_timestamp(timestamp): return int(timestamp - (timestamp % 30)) diff --git a/bin/lib/correlations_engine.py b/bin/lib/correlations_engine.py index bc7e0e890..1d63192f7 100755 --- a/bin/lib/correlations_engine.py +++ b/bin/lib/correlations_engine.py @@ -41,6 +41,7 @@ ################################## CORRELATION_TYPES_BY_OBJ = { + "author": ["pdf"], "barcode": ["chat", "cve", "cryptocurrency", "decoded", "domain", "image", "message", "screenshot"], "chat": ["barcode", "chat-subchannel", "chat-thread", "cryptocurrency", "cve", "decoded", "domain", "image", "message", "ocr", "pgp", "user-account"], "chat-subchannel": ["chat", "chat-thread", "image", "message", "ocr", "user-account"], @@ -52,16 +53,17 @@ "domain": ["barcode", "chat", "cve", "cookie-name", "cryptocurrency", "dom-hash", "decoded", "etag", "favicon", "gtracker", "hhhash", "item", "mail", "message", "pgp", "screenshot", "ssh-key", "title", "username"], "dom-hash": ["domain", "item"], "etag": ["domain"], - "favicon": ["domain", "item"], # TODO Decoded - "file-name": ["chat", "item", "message"], + "favicon": ["domain", "pdf", "item"], # TODO Decoded + "file-name": ["chat", "item", "message", "pdf"], "gtracker": ["domain", "item"], "hhhash": ["domain"], "image": ["barcode", "chat", "chat-subchannel", "chat-thread", "message", "ocr", "qrcode", "user-account"], # TODO subchannel + threads ???? "ip": ["ssh-key"], - "item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "file-name", "gtracker", "mail", "message", "pgp", "screenshot", "title", "username"], # chat ??? + "item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "file-name", "gtracker", "mail", "message", "pdf", "pgp", "screenshot", "title", "username"], # chat ??? "mail": ["domain", "item", "message"], # chat ?? 
- "message": ["barcode", "chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "domain", "file-name", "image", "item", "mail", "ocr", "pgp", "user-account"], + "message": ["barcode", "chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "domain", "file-name", "image", "item", "mail", "ocr", "pdf", "pgp", "user-account"], "ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"], + "pdf": ["author", "chat", "file-name", "item", "message"], "pgp": ["chat", "domain", "item", "message", "ocr"], "qrcode": ["chat", "cve", "cryptocurrency", "decoded", "domain", "image", "message", "screenshot"], # "chat-subchannel", "chat-thread" ????? "screenshot": ["barcode", "domain", "item", "qrcode"], @@ -230,6 +232,8 @@ def _get_correlations_graph_node(links, nodes, meta, obj_type, subtype, obj_id, obj_correlations = get_correlations(obj_type, subtype, obj_id, filter_types=filter_types) # print(obj_correlations) + + # add direct correlation for correl_type in obj_correlations: for str_obj in obj_correlations[correl_type]: subtype2, obj2_id = str_obj.split(':', 1) @@ -245,10 +249,14 @@ def _get_correlations_graph_node(links, nodes, meta, obj_type, subtype, obj_id, if len(nodes) > max_nodes != 0: meta['complete'] = False - break + return None nodes.add(obj2_str_id) links.add((obj_str_id, obj2_str_id)) + # level + 1 + for correl_type in obj_correlations: + for str_obj in obj_correlations[correl_type]: + subtype2, obj2_id = str_obj.split(':', 1) if level > 0: next_level = level - 1 _get_correlations_graph_node(links, nodes, meta, correl_type, subtype2, obj2_id, next_level, max_nodes, filter_types=filter_types, objs_hidden=objs_hidden, previous_str_obj=obj_str_id) diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index bc1aa6295..475579b4e 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -60,6 +60,7 @@ ITEMS_FOLDER = 
config_loader.get_config_str("Directories", "pastes") HAR_DIR = config_loader.get_files_directory('har') +COOKIEJAR_LOCAL_STORAGE = config_loader.get_files_directory('cookiejar_local_storage') activate_crawler = config_loader.get_config_str("Crawler", "activate_crawler") D_HAR = config_loader.get_config_boolean('Crawler', 'default_har') D_SCREENSHOT = config_loader.get_config_boolean('Crawler', 'default_screenshot') @@ -78,11 +79,14 @@ def api_get_onion_lookup(domain): # TODO check if object process done ??? domain = domain.lower().strip() - parts = domain.split('.onion') - # if len(parts) > 1: - # for word in [part + '.onion' for part in parts[:-1]] + [parts[-1]]: - # if len(word) >= 32 and word.endswith('.onion'): - # api_get_onion_lookup(word) + words = domain.split() + if len(words) > 1: + for word in words: + if '.onion' in word: + domain = word + + if '.onion' not in domain: + return {'error': 'Invalid Onion Domain', 'domain': domain}, 404 url_unpack = unpack_url(domain) if not url_unpack: @@ -114,9 +118,12 @@ def api_get_onion_lookup(domain): # TODO check if object process done ??? del meta['type'] del meta['status'] meta['titles'] = [] - for h in dom.get_correlation('title').get('title', []): - t = Titles.Title(h[1:]) - meta['titles'].append(t.get_content()) + if not Tag.is_tags_safe(tags): + meta['titles'].append("Redacted") + else: + for h in dom.get_correlation('title').get('title', []): + t = Titles.Title(h[1:]) + meta['titles'].append(t.get_content()) return meta def api_get_domain_from_url(url): @@ -803,7 +810,38 @@ def get_cookies(self, r_json=False): def get_nb_cookies(self): return r_crawler.scard(f'cookiejar:cookies:{self.uuid}') - def get_meta(self, level=False, nb_cookies=False, cookies=False, r_json=False): + def get_local_storage_file(self): + return f'{os.path.join(COOKIEJAR_LOCAL_STORAGE, self.uuid)}.gz' + + def exists_local_storage(self): # TODO SPLIT in multiple directory ????? 
+ return os.path.isfile(self.get_local_storage_file()) + + def get_local_storage(self, r_json=False): + try: + with gzip.open(self.get_local_storage_file()) as f: + try: + storage = json.loads(f.read()) + if r_json: + return json.dumps(storage, indent=2) + else: + return storage + except json.decoder.JSONDecodeError: + return {} + except Exception as e: + print(e) # TODO LOGS + return {} + + def set_local_storage(self, storage): # TODO check if file already exists + with gzip.open(self.get_local_storage_file(), 'w+') as f: + f.write(json.dumps(storage).encode()) + + def delete_local_storage(self): + try: + os.remove(self.get_local_storage_file()) + except Exception as e: + print(e) + + def get_meta(self, level=False, nb_cookies=False, cookies=False, local_storage=False, r_json=False): meta = {'uuid': self.uuid, 'date': self.get_date(), 'description': self.get_description(), @@ -816,6 +854,8 @@ def get_meta(self, level=False, nb_cookies=False, cookies=False, r_json=False): meta['nb_cookies'] = self.get_nb_cookies() if cookies: meta['cookies'] = self.get_cookies(r_json=r_json) + if local_storage: + meta['local_storage'] = self.get_local_storage(r_json=r_json) return meta def add_cookie(self, name, value, cookie_uuid=None, domain=None, httponly=None, path=None, secure=None, text=None): @@ -867,6 +907,7 @@ def create(self, user_org, user_id, level, description=None): def delete(self): for cookie_uuid in self.get_cookies_uuid(): self.delete_cookie(cookie_uuid) + self.delete_local_storage() r_crawler.srem(f'cookiejars:user:{self.get_user()}', self.uuid) r_crawler.srem('cookiejars:global', self.uuid) r_crawler.srem('cookiejars:all', self.uuid) @@ -920,6 +961,18 @@ def api_edit_cookiejar_description(user_org, user_id, user_role, cookiejar_uuid, cookiejar.set_description(description) return {'cookiejar_uuid': cookiejar_uuid}, 200 +def api_delete_cookiejar_local_storage(user_org, user_id, user_role, cookiejar_uuid): + resp = api_check_cookiejar_access_acl(cookiejar_uuid, 
user_org, user_id, user_role, 'edit') + if resp: + return resp + cookiejar = Cookiejar(cookiejar_uuid) + if not cookiejar.exists(): + return {'error': 'unknown cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404 + if not cookiejar.exists_local_storage(): + return {'error': 'local storage do not exists', 'cookiejar_uuid': cookiejar_uuid}, 404 + cookiejar.delete_local_storage() + return {'cookiejar_uuid': cookiejar_uuid}, 200 + def api_delete_cookiejar(user_org, user_id, user_role, cookiejar_uuid): resp = api_check_cookiejar_access_acl(cookiejar_uuid, user_org, user_id, user_role, 'delete') if resp: @@ -933,7 +986,7 @@ def api_get_cookiejar(user_org, user_id, user_role, cookiejar_uuid): if resp: return resp cookiejar = Cookiejar(cookiejar_uuid) - meta = cookiejar.get_meta(level=True, cookies=True, r_json=True) + meta = cookiejar.get_meta(level=True, cookies=True, local_storage=True, r_json=True) return meta, 200 #### ACL #### @@ -947,6 +1000,41 @@ def api_check_cookiejar_access_acl(cookiejar_uuid, user_org, user_id, user_role, #### API #### + +######################################################################### + +# TODO edit existing cookiejat local storage +def api_import_lacus_cookiejar(user_org, user_id, data, cookiejar_uuid=None): + url = data.get('url') + storage = data.get('storage') + + if not url: + return {'error': 'url not set'}, 400 + if not storage: + return {'error': 'lacus storage not set'}, 400 + + cookiejar_uuid = None # TODO edit/replace cookiejar + cookies = storage.get('cookies') + origins = storage.get('origins') + + if not cookies and not origins: + return {'error': 'No cookies or local storage to import'}, 400 + + # TODO check if is valid JSON + + # TODO extract DOMAIN + + # Create new cookiejar + if not cookiejar_uuid: + cookiejar_uuid = create_cookiejar(user_org, user_id, f"{url} - imported from lacus", 1, None) + cookiejar = Cookiejar(cookiejar_uuid) + + cookiejar.set_local_storage(storage) + + return {'cookiejar_uuid': 
cookiejar_uuid}, 200 + +######################################################################### + # # # # # # # # # # # COOKIES # @@ -1071,6 +1159,8 @@ def api_create_cookie(user_org, user_id, user_role, cookiejar_uuid, cookie_dict) resp = api_check_cookiejar_access_acl(cookiejar_uuid, user_org, user_id, user_role, 'edit') if resp: return resp + if not cookie_dict: + return {'error': 'no cookies provided'}, 400 if 'name' not in cookie_dict or 'value' not in cookie_dict or not cookie_dict['name'] or not cookie_dict['value']: return {'error': 'cookie name or value not provided'}, 400 cookiejar = Cookiejar(cookiejar_uuid) @@ -2040,6 +2130,14 @@ def get_cookies(self): else: return [] + def get_local_storage(self): + cookiejar = self.get_cookiejar() + if cookiejar: + cookiejar = Cookiejar(cookiejar) + return cookiejar.get_local_storage() + else: + return None + def get_header(self): return r_crawler.hget(f'crawler:task:{self.uuid}', 'header') diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py index 6e9a6a8fd..92d3fa789 100755 --- a/bin/lib/item_basic.py +++ b/bin/lib/item_basic.py @@ -225,9 +225,8 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt # empty directory if not l_dir: if source_name: - return l_sources_name.add(source_name) - else: - return l_sources_name + l_sources_name.add(source_name) + return l_sources_name else: for src_name in l_dir: if len(src_name) == 4 and source_name: diff --git a/bin/lib/module_extractor.py b/bin/lib/module_extractor.py index 14ef7de30..ab98a556e 100755 --- a/bin/lib/module_extractor.py +++ b/bin/lib/module_extractor.py @@ -37,7 +37,7 @@ r_cache = config_loader.get_redis_conn("Redis_Cache") config_loader = None -r_key = regex_helper.generate_redis_cache_key('extractor') +r_key = regex_helper.generate_redis_cache_key('extractor') # TODO MOVE IN extractor function # SIGNAL ALARM @@ -121,7 +121,7 @@ def get_correl_match(extract_type, obj, content): value_id = map_value_id.get(sha256_val) if 
not value_id: # logger.critical(f'Error module extractor: {sha256_val}\n{extract_type}\n{subtype}\n{value_id}\n{map_value_id}\n{objs}') - print(f'Error module extractor: {sha256_val}\n{extract_type}\n{subtype}\n{value_id}\n{map_value_id}\n{objs}') + # print(f'Error module extractor: {sha256_val}\n{extract_type}\n{subtype}\n{value_id}\n{map_value_id}\n{objs}') value_id = 'ERROR' extracted.append([ob[0], ob[1], ob[2], f'{extract_type}:{subtype}:{value_id}']) return extracted @@ -147,18 +147,14 @@ def convert_byte_offset_to_string(b_content, offset): return offset except UnicodeDecodeError as e: # logger.error(f'Yara offset converter error, {str(e)}\n{offset}/{len(b_content)}') - print(f'Yara offset converter error, {str(e)}\n{offset}/{len(b_content)}') + # print(f'Yara offset converter error, {str(e)}\n{offset}/{len(b_content)}') return convert_byte_offset_to_string(b_content, offset - 1) -# TODO RETRO HUNTS -# TODO TRACKER TYPE IN UI -def get_tracker_match(user_org, user_id, obj, content): +def _get_trackers_match(trackers_uuids, user_org, user_id, obj_gid, content, priority=None): extracted = [] extracted_yara = [] - obj_gid = obj.get_global_id() - trackers = Tracker.get_obj_trackers(obj.type, obj.get_subtype(r_str=True), obj.id) - for tracker_uuid in trackers: + for tracker_uuid in trackers_uuids: tracker = Tracker.Tracker(tracker_uuid) if not tracker.check_level(user_org, user_id): continue @@ -174,9 +170,8 @@ def get_tracker_match(user_org, user_id, obj, content): rule = tracker.get_rule() rule.match(data=content.encode(), callback=_get_yara_match, which_callbacks=yara.CALLBACK_MATCHES, timeout=5) - yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}') + yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}') # set in _get_yara_match callback r_cache.delete(f'extractor:yara:match:{r_key}') - extracted = [] for match in yara_match: start, end, value = match.split(':', 2) extracted_yara.append([int(start), int(end), value, 
f'tracker:{tracker.uuid}']) @@ -193,10 +188,12 @@ def get_tracker_match(user_org, user_id, obj, content): # print(regex_match) for match in regex_match: extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}']) + return extracted, extracted_yara + - # Retro Hunt - retro_hunts = Tracker.get_obj_retro_hunts(obj.type, obj.get_subtype(r_str=True), obj.id) - for retro_uuid in retro_hunts: +def _extract_retro_hunts(retro_hunts_uuids, user_org, content, priority=None): + extracted_yara = [] + for retro_uuid in retro_hunts_uuids: retro_hunt = Tracker.RetroHunt(retro_uuid) if not retro_hunt.check_level(user_org): continue @@ -209,12 +206,45 @@ def get_tracker_match(user_org, user_id, obj, content): rule.match(data=content.encode(), callback=_get_yara_match, which_callbacks=yara.CALLBACK_MATCHES, timeout=5) - yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}') + yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}') # set in _get_yara_match callback r_cache.delete(f'extractor:yara:match:{r_key}') - extracted = [] for match in yara_match: start, end, value = match.split(':', 2) extracted_yara.append([int(start), int(end), value, f'retro_hunt:{retro_hunt.uuid}']) + return extracted_yara + + +# TODO TRACKER TYPE IN UI +def get_tracker_match(user_org, user_id, obj, content, priority=None, match_uuid=None): + obj_gid = obj.get_global_id() + + if match_uuid: + extracted = [] + if Tracker.is_tracker(match_uuid): + extracted, extracted_yara = _get_trackers_match([match_uuid], user_org, user_id, obj_gid, content) + # retro_hunt + else: + extracted_yara = _extract_retro_hunts([match_uuid], user_org, content) + + else: + trackers_uuids = Tracker.get_obj_trackers(obj.type, obj.get_subtype(r_str=True), obj.id) + retro_hunts_uuids = Tracker.get_obj_retro_hunts(obj.type, obj.get_subtype(r_str=True), obj.id) + + # check if priority is tracker or retro + if priority: + if priority in trackers_uuids: + extracted, extracted_yara = 
_get_trackers_match(trackers_uuids, user_org, user_id, obj_gid, content, priority=priority) + extracted_retro_yara = _extract_retro_hunts(retro_hunts_uuids, user_org, content, priority=priority) + else: + extracted_retro_yara = _extract_retro_hunts(retro_hunts_uuids, user_org, content, priority=priority) + extracted, extracted_yara = _get_trackers_match(trackers_uuids, user_org, user_id, obj_gid, content) + else: + extracted, extracted_yara = _get_trackers_match(trackers_uuids, user_org, user_id, obj_gid, content) + extracted_retro_yara = _extract_retro_hunts(retro_hunts_uuids, user_org, content) + if extracted_yara and extracted_retro_yara: + extracted_yara[0:0] = extracted_retro_yara + elif extracted_retro_yara: + extracted_yara = extracted_retro_yara # Convert byte offset to string offset if extracted_yara: @@ -226,14 +256,13 @@ def get_tracker_match(user_org, user_id, obj, content): start = convert_byte_offset_to_string(b_content, yara_m[0]) end = convert_byte_offset_to_string(b_content, yara_m[1]) extracted.append([int(start), int(end), yara_m[2], yara_m[3]]) - return extracted + # Type:subtype:id # tag:iban # tracker:uuid -# def extract(obj_id, content=None): -def extract(user_id, obj_type, subtype, obj_id, content=None): +def extract(user_id, obj_type, subtype, obj_id, content=None, priority=None, match_uuid=None): obj = ail_objects.get_object(obj_type, subtype, obj_id) if not obj.exists(): return [] @@ -252,21 +281,22 @@ def extract(user_id, obj_type, subtype, obj_id, content=None): try: if not content: content = obj.get_content() - extracted = get_tracker_match(user_org, user_id, obj, content) - # print(item.get_tags()) - for tag in obj.get_tags(): - if MODULES.get(tag): - # print(tag) - module = MODULES.get(tag) - matches = module.extract(obj, content, tag) + extracted = get_tracker_match(user_org, user_id, obj, content, match_uuid=match_uuid) + if not match_uuid: + # print(item.get_tags()) + for tag in obj.get_tags(): + if MODULES.get(tag): + # 
print(tag) + module = MODULES.get(tag) + matches = module.extract(obj, content, tag) + if matches: + extracted = extracted + matches + + for obj_t in CORRELATION_TO_EXTRACT[obj.type]: + matches = get_correl_match(obj_t, obj, content) if matches: extracted = extracted + matches - for obj_t in CORRELATION_TO_EXTRACT[obj.type]: - matches = get_correl_match(obj_t, obj, content) - if matches: - extracted = extracted + matches - # SORT By Start Pos if extracted: extracted = sorted(extracted, key=itemgetter(0)) @@ -325,7 +355,7 @@ def get_extracted_by_match(extracted): matches[str_obj]['link'] = ail_objects.get_object_link(ob_type, subtype, obj_id) except TypeError: # logger.critical(f'module extractor invalid object: {ob_type} : {subtype} : {obj_id}') - print(f'module extractor invalid object: {ob_type} : {subtype} : {obj_id}') + # print(f'module extractor invalid object: {ob_type} : {subtype} : {obj_id}') matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf00d', 'color': 'red', 'radius': 5} matches[str_obj]['link'] = '' diff --git a/bin/lib/objects/Authors.py b/bin/lib/objects/Authors.py new file mode 100755 index 000000000..1f1163c6d --- /dev/null +++ b/bin/lib/objects/Authors.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys + +from hashlib import sha256 +from pymisp import MISPObject + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects +from lib.ConfigLoader import ConfigLoader +from packages import Date +# from lib.data_retention_engine import update_obj_date, get_obj_date_first + +from flask import url_for + +config_loader = ConfigLoader() +r_object = config_loader.get_db_conn("Kvrocks_Objects") +r_cache = config_loader.get_redis_conn("Redis_Cache") +baseurl = config_loader.get_config_str("Notifications", "ail_domain") +IMAGE_FOLDER = 
config_loader.get_files_directory('images') +config_loader = None + +class Author(AbstractDaterangeObject): + """ + AIL Author Object. (strings) + """ + + def __init__(self, id): + super(Author, self).__init__('author', id) + + def get_content(self, r_type='str'): + """ + Returns content + """ + global_id = self.get_global_id() + content = r_cache.get(f'content:{global_id}') + if not content: + content = self._get_field('content') + # Set Cache + if content: + global_id = self.get_global_id() + r_cache.set(f'content:{global_id}', content) + r_cache.expire(f'content:{global_id}', 300) + if r_type == 'str': + return content + elif r_type == 'bytes': + if content: + return content.encode() + + def get_date(self): # TODO + return Date.get_today_date_str() + + def get_nb_seen(self): + return self.get_nb_correlation('pdf') + + def get_source(self): + """ + Returns source/feeder name + """ + return 'author' + + def get_basename(self): + return 'author' + + def get_link(self, flask_context=False): + if flask_context: + url = url_for('correlation.show_correlation', type=self.type, id=self.id) + else: + url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}' + return url + + def get_svg_icon(self): + return {'style': 'fas', 'icon': '\uf4ff', 'color': 'grey', 'radius': 5} + + def get_misp_object(self): # TODO + pass + # obj = MISPObject('instant-message', standalone=True) + # obj_date = self.get_date() + # if obj_date: + # obj.first_seen = obj_date + # else: + # self.logger.warning( + # f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={obj_date}') + # + # # obj_attrs = [obj.add_attribute('first-seen', value=obj_date), + # # obj.add_attribute('raw-data', value=self.id, data=self.get_raw_content()), + # # obj.add_attribute('sensor', value=get_ail_uuid())] + # obj_attrs = [] + # for obj_attr in obj_attrs: + # for tag in self.get_tags(): + # obj_attr.add_tag(tag) + # return obj + + # options: set of optional meta fields + def get_meta(self, 
options=None): + """ + :type options: set + """ + if options is None: + options = set() + meta = self._get_meta(options=options) + meta['tags'] = self.get_tags() + meta['content'] = self.get_content() + return meta + + def create(self, content, obj_authored, tags=[]): + self._set_field('content', content) + self._copy_from(obj_authored.type, obj_authored.get_id()) + for tag in tags: + self.add_tag(tag) + return self.id + + # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ + def delete(self): + r_object.delete(f'author:{self.id}') + + +def create(content, obj_authored, tags=[]): + if content: + obj_id = sha256(content.encode()).hexdigest() + obj = Author(obj_id) + if not obj.exists(): + obj.create(content, obj_authored, tags=tags) + return obj + +class Authors(AbstractDaterangeObjects): + """ + Barcodes Objects + """ + def __init__(self): + super().__init__('author', Author) + + def get_name(self): + return 'Authors' + + def get_icon(self): + return {'fa': 'fas', 'icon': 'user-pen'} + + def get_link(self, flask_context=False): + if flask_context: + url = url_for('objects_author.objects_authors') + else: + url = f'{baseurl}/objects/authors' + return url + + def sanitize_id_to_search(self, name_to_search): + return name_to_search # TODO + + +#### API #### +def api_get_author(obj_id): + obj = Author(obj_id) + if not obj.exists(): + return {"status": "error", "reason": "Unknown author"}, 404 + meta = obj.get_meta({'content', 'icon', 'link'}) + return meta, 200 diff --git a/bin/lib/objects/Decodeds.py b/bin/lib/objects/Decodeds.py index 3339883cf..b781e66a1 100755 --- a/bin/lib/objects/Decodeds.py +++ b/bin/lib/objects/Decodeds.py @@ -131,9 +131,12 @@ def get_content(self, mimetype=None, r_type='str'): else: return b'' if r_type == 'str': - with open(filepath, 'r') as f: - content = f.read() - return content + try: + with open(filepath, 'r') as f: + content = f.read() + return content + except UnicodeDecodeError: + return '' elif r_type == 'bytes': with open(filepath, 'rb') 
as f: content = f.read() diff --git a/bin/lib/objects/FilesNames.py b/bin/lib/objects/FilesNames.py index a2fa33660..544602739 100755 --- a/bin/lib/objects/FilesNames.py +++ b/bin/lib/objects/FilesNames.py @@ -16,6 +16,7 @@ config_loader = ConfigLoader() r_object = config_loader.get_db_conn("Kvrocks_Objects") +baseurl = config_loader.get_config_str("Notifications", "ail_domain") config_loader = None @@ -59,10 +60,17 @@ def get_misp_object(self): obj_attr.add_tag(tag) return obj + def get_pdf(self): + pdfs = self.get_correlation('pdf').get('pdf', set()) + if len(pdfs) == 1: + return pdfs.pop()[1:] + def get_meta(self, options=set()): meta = self._get_meta(options=options) meta['id'] = self.id meta['tags'] = self.get_tags(r_list=True) + if 'pdf' in options: + meta['pdf'] = self.get_pdf() if 'tags_safe' in options: meta['tags_safe'] = self.is_tags_safe(meta['tags']) return meta diff --git a/bin/lib/objects/Images.py b/bin/lib/objects/Images.py index aa8c31961..0647ee847 100755 --- a/bin/lib/objects/Images.py +++ b/bin/lib/objects/Images.py @@ -97,6 +97,7 @@ def get_description_models(self): if key.startswith('desc:'): model = key[5:] models.append(model) + return models def add_description_model(self, model, description): self._set_field(f'desc:{model}', description) diff --git a/bin/lib/objects/Messages.py b/bin/lib/objects/Messages.py index 658357bb8..b7d0e51e0 100755 --- a/bin/lib/objects/Messages.py +++ b/bin/lib/objects/Messages.py @@ -202,18 +202,30 @@ def get_files(self, file_names=None): files = {} nb_files = 0 s_files = set() + # TODO PERF for file_name in file_names: + # item for it in self.get_correlation_iter('file-name', '', file_name, 'item'): if file_name not in files: files[file_name] = [] - files[file_name].append({'obj': it[1:], 'tags': self.get_obj_tags('item', '', it[1:])}) + files[file_name].append({'type': 'item', 'subtype': '', 'id': it[1:], 'tags': self.get_obj_tags('item', '', it[1:])}) + s_files.add(it[1:]) + nb_files += 1 + # pdf + for it in 
self.get_correlation_iter('file-name', '', file_name, 'pdf'): + if file_name not in files: + files[file_name] = [] + files[file_name].append({'type': 'pdf', 'subtype': '', 'id': it[1:], 'tags': self.get_obj_tags('pdf', '', it[1:])}) s_files.add(it[1:]) nb_files += 1 if nb_files < self.get_nb_files(): files['undefined'] = [] for f in self.get_correlation('item').get('item'): if f[1:] not in s_files: - files['undefined'].append({'obj': f[1:], 'tags': self.get_obj_tags('item', '', f[1:])}) + files['undefined'].append({'type': 'item', 'subtype': '', 'id': f[1:], 'tags': self.get_obj_tags('item', '', f[1:])}) + for f in self.get_correlation('pdf').get('pdf'): + if f[1:] not in s_files: + files['undefined'].append({'type': 'pdf', 'subtype': '', 'id': f[1:], 'tags': self.get_obj_tags('pdf', '', f[1:])}) return files def get_reactions(self): diff --git a/bin/lib/objects/PDFs.py b/bin/lib/objects/PDFs.py new file mode 100755 index 000000000..43465b7d8 --- /dev/null +++ b/bin/lib/objects/PDFs.py @@ -0,0 +1,448 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import base64 +import os +import sys +import time + +import pymupdf +import html2text + +from io import BytesIO +from shapely.geometry import box +from flask import url_for +from pymisp import MISPObject + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.ConfigLoader import ConfigLoader +from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects +from lib import Language +from packages import Date +# from lib.ail_core import get_default_image_description_model + +config_loader = ConfigLoader() +r_cache = config_loader.get_redis_conn("Redis_Cache", decode_responses=False) +r_serv_metadata = config_loader.get_db_conn("Kvrocks_Objects") +PDF_FOLDER = os.path.join(config_loader.get_files_directory('files'), 'pdf') +PDF_MAX_SIZE = config_loader.get_config_int('Directories', 
'max_pdf_size') # bytes +PDF_TRANSLATED_DIR = config_loader.get_files_directory('translated_pdf') +if not os.path.isdir(PDF_TRANSLATED_DIR): + os.makedirs(PDF_TRANSLATED_DIR) +PDF_TRANSLATED_TTL = config_loader.get_config_int('Directories', 'pdf_translation_ttl') +baseurl = config_loader.get_config_str("Notifications", "ail_domain") +config_loader = None + + +def is_bboxs_overlapping(bbox1, bbox2): + b1 = box(0, bbox1[1], 1, bbox1[3] - 2) + b2 = box(0, bbox2[1], 1, bbox2[3]) + return b1.intersects(b2) + + +class PDF(AbstractDaterangeObject): + """ + AIL PDF Object. + """ + + # ID = SHA256 + def __init__(self, obj_id): + super(PDF, self).__init__('pdf', obj_id) + + # def get_ail_2_ail_payload(self): + # payload = {'raw': self.get_gzip_content(b64=True), + # 'compress': 'gzip'} + # return payload + + # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ + def delete(self): + # # TODO: + pass + + def exists(self): + return os.path.isfile(self.get_filepath()) + + def get_link(self, flask_context=False): + if flask_context: + url = url_for('objects_pdf.pdf_view', id=self.id) + else: + url = f'/pdf/view?id={self.id}' + return url + + def get_svg_icon(self): + return {'style': 'far', 'icon': '\uf1c1', 'color': '#cc6600', 'radius': 5} + + def get_rel_path(self): + rel_path = os.path.join(self.id[0:2], self.id[2:4], self.id[4:6], self.id[6:8], self.id[8:10], self.id[10:12], self.id[12:]) + return rel_path + + def get_filepath(self): + filename = os.path.join(PDF_FOLDER, self.get_rel_path()) + return os.path.realpath(filename) + + def get_file_content(self): + filepath = self.get_filepath() + with open(filepath, 'rb') as f: + file_content = BytesIO(f.read()) + return file_content + + def get_base64(self): + return base64.b64encode(self.get_file_content().read()).decode() + + def get_content(self, r_type='str'): + if r_type == 'str': + return None + else: + return self.get_file_content() + + def get_markdown_id(self): + return self.get_correlation('item').get('item', []).pop()[1:] + 
+ def get_author(self): + author = self.get_correlation('author').get('author', []) + if author: + return author.pop()[1:] + return None + + def get_file_names(self): + file_names = [] + for f in self.get_correlation('file-name').get('file-name', []): + file_names.append(f[1:]) + return file_names + + def get_translated(self, language_name=False): + obj_gid = self.get_global_id() + translated = Language.get_obj_translated(obj_gid, language_name=language_name) + task = Language.get_object_tasks(obj_gid, language_name=language_name) + return {'translated': translated, 'task': task} + + def get_misp_object(self): # TODO + obj_attrs = [] + obj = MISPObject('file') + + obj_attrs.append(obj.add_attribute('sha256', value=self.id)) + obj_attrs.append(obj.add_attribute('attachment', value=self.id, data=self.get_file_content())) + for obj_attr in obj_attrs: + for tag in self.get_tags(): + obj_attr.add_tag(tag) + return obj + + def get_meta(self, options=set(), flask_context=False): + meta = self._get_meta(options=options, flask_context=flask_context) + meta['id'] = self.id + meta['tags'] = self.get_tags(r_list=True) + if 'content' in options: + meta['content'] = self.get_content() + if 'tags_safe' in options: + meta['tags_safe'] = self.is_tags_safe(meta['tags']) + if 'author' in options: + meta['author'] = self.get_author() + if 'file-names' in options: + meta['file-names'] = self.get_file_names() + if 'markdown_id' in options: + meta['markdown_id'] = self.get_markdown_id() + if 'translated' in options: + meta['translated'] = self.get_translated(language_name=True) + return meta + + def translate(self, task, source, target): # TODO harmonize + + filename = Language.exists_object_translation_language(self.get_global_id(), target) + if filename: + return filename + + doc = pymupdf.open(self.get_filepath()) + done = 0 + total = doc.page_count + ocg_xref = doc.add_ocg("Translated", on=True) + ocg_tab = doc.add_ocg("Table", on=True) + h = html2text.HTML2Text() + h.ignore_links = 
True + + # p = 0 + for page in doc: + # p += 1 + # if p != 31: + # continue + tabs = page.find_tables() # detect the tables + tabs_extracted = {} + html_box_tables = [] + for tab in tabs: + # print(tab) + rows_text = tab.extract() + # check if is a real table + nb_cell = 0 + none_column = False + for column in rows_text: + nb_column = 0 + for v in column: + if v: + nb_column += 1 + if nb_column < 1: + none_column = True + break + else: + nb_cell += nb_column + if nb_cell > 1 and not none_column: + tabs_extracted[str(tab.bbox)] = rows_text + current_row = 0 + # table coord -> tab.bbox + for row in tab.rows: + i = 0 + for cell in row.cells: + if cell: + original = rows_text[current_row][i] + if original: + _, translated = Language.translate(original.strip(), source=source, target=target, filter_same_content=False) + if translated: + translated = translated.strip() + if translated: + translated = translated.replace('\n', '\\n') + translated = h.handle(translated.strip()).replace('\\n', '
').replace('\n', ' ').replace('\\.', '.') + html_box_tables.append((cell, translated)) + i += 1 + current_row += 1 + # TODO TAB HEADERS + # print(tab.header.external) + # if tab.header.external: + # print('EXTERNAL --------------------------------') + # print(tab.header.bbox) + # print(tab.header.names) + # for cell in tab.header.cells: + # print(cell) + + blocks = page.get_text('blocks', flags=pymupdf.TEXT_DEHYPHENATE) + for block in blocks: + bbox = block[:4] + original = block[4] + if original: + original = "\n".join(" ".join(line.split()) for line in original.splitlines()) + original = original.strip() + original = original.replace(' \n', '\n') + is_overlapping = False + if tabs and original: + l_overlapp = [] + for tab in tabs: + if str(tab.bbox) in tabs_extracted: + # tab y <= + # text in table + if tab.bbox[1] <= bbox[1] and bbox[3] <= tab.bbox[3] + 2: + is_overlapping = True + break + if is_bboxs_overlapping(tab.bbox, bbox): + l_overlapp.append(tab) + if len(l_overlapp) == 1: + tab = l_overlapp[0] + # filter start + end + if tab.bbox[1] > bbox[1] + 2 and tab.bbox[3] < bbox[3] - 2: + pass + + # Text start + elif tab.bbox[1] > bbox[1]: + tab_extract = tabs_extracted[str(tab.bbox)][0][0] + if tab_extract: + if original.startswith(tab_extract): + is_overlapping = True + else: + y2 = tab.bbox[1] - 1 + original = original.split(tab_extract, 1)[0] + if original: + original = original.strip() + bbox = (bbox[0], bbox[1], bbox[2], y2) + # Text end + elif tab.bbox[3] < bbox[3]: + tab_extract = tabs_extracted[str(tab.bbox)][-1][-1] + if tab_extract: + if original.endswith(tab_extract): + is_overlapping = True + else: + original = original.rsplit(tab_extract, 1) + if len(original) == 2: + original = original[1] + y1 = tab.bbox[3] + 1 + bbox = (bbox[0], y1, bbox[2], bbox[3]) + else: + is_overlapping = True + + elif len(l_overlapp) == 2: + first_tab = None + last_tab = None + for tab in l_overlapp: + if tab.bbox[1] < bbox[1] and tab.bbox[3] < bbox[3]: + first_tab = 
tab.bbox + elif tab.bbox[1] > bbox[1] and tab.bbox[3] < bbox[3]: + last_tab = tab.bbox + if first_tab and last_tab: + # remove first tab + first_tab_last_cell = tabs_extracted[str(first_tab)][-1][-1] + if first_tab_last_cell: + original = original.split(first_tab_last_cell, 1)[1] + if original: + original = original.strip() + bbox = (bbox[0], first_tab[3] + 1, bbox[2], bbox[3]) + # remove last tab + last_tab_first_row = tabs_extracted[str(last_tab)][0] + last_tab_first_row = ['' if x is None else x for x in last_tab_first_row] + last_tab_first_row = '\n'.join(last_tab_first_row) + if last_tab_first_row: + original = original.rsplit(last_tab_first_row)[0] + bbox = (bbox[0], bbox[1], bbox[2], last_tab[1] - 1) + + if not is_overlapping and original: + _, translated = Language.translate(original, source=source, target=target) + page.draw_rect(bbox, color=None, fill=pymupdf.pdfcolor['white'], oc=ocg_xref) + # print(translated) + if translated: + translated = translated.strip() + if translated: + translated = translated.replace('\n', '\\n') + translated = h.handle(translated.strip()).replace('\\n', '
').replace('\n', ' ').replace('\\.', '.') + page.draw_rect(bbox, color=None, fill=pymupdf.pdfcolor['white'], oc=ocg_xref) + page.insert_htmlbox(bbox, translated, oc=ocg_xref) + + # add table + if html_box_tables: + for cell, translated in html_box_tables: + page.draw_rect(cell, color=None, fill=pymupdf.pdfcolor['white'], oc=ocg_tab) + page.insert_htmlbox(cell, translated, oc=ocg_tab) + done += 1 + print(done) + task.update_progress(done, total) + + print(task) + + # Save translated PDF + # translated = doc.tobytes(garbage=0, deflate=True) + filename = f'{target}_{int(time.time())}_{self.id}.pdf' + doc.ez_save(os.path.join(PDF_TRANSLATED_DIR, filename)) + # doc.subset_fonts() ???? reduce size ??? + # doc.ez_save("orca-korean.pdf") + + task.complete(filename) + return filename + + def delete_translated(self, target): + obj_gid = self.get_global_id() + filename = Language.get_object_translation_language(obj_gid, target) + if filename: + Language.delete_obj_translation(obj_gid, target) + os.remove(os.path.join(PDF_TRANSLATED_DIR, filename)) + + def create(self, content): + filepath = self.get_filepath() + dirname = os.path.dirname(filepath) + if not os.path.exists(dirname): + os.makedirs(dirname) + with open(filepath, 'wb') as f: + f.write(content) + + +def get_all_pdfs(): + objs = [] + for root, dirs, files in os.walk(PDF_FOLDER): + for file in files: + path = f'{root}{file}' + obj_id = path.replace(PDF_FOLDER, '').replace('/', '') + objs.append(obj_id) + return objs + + +def get_all_pdfs_objects(filters={}): + for obj_id in get_all_pdfs(): + yield PDF(obj_id) + +# obj_id -> original pdf sha256 +def create(obj_id, content, size_limit=PDF_MAX_SIZE, b64=False, force=False): + size = (len(content)*3) / 4 + if size <= size_limit or size_limit < 0 or force: + if b64: + content = base64.standard_b64decode(content.encode()) + obj = PDF(obj_id) + if not obj.exists(): + obj.create(content) + return obj + +def delete_translated_pdfs(): + for filename in 
os.listdir(PDF_TRANSLATED_DIR): + if os.path.isfile(os.path.join(PDF_TRANSLATED_DIR, filename)): + target, t, obj_id = filename.split('_', 2) + obj_id = obj_id[:-4] + print('deleted:', filename) + Language.delete_obj_translation(f'pdf::{obj_id}', target) + os.remove(os.path.join(PDF_TRANSLATED_DIR, filename)) + +def ttl_translated_pdfs(): + for filename in os.listdir(PDF_TRANSLATED_DIR): + if os.path.isfile(os.path.join(PDF_TRANSLATED_DIR, filename)): + target, t, obj_id = filename.split('_', 2) + t = int(t) + obj_id = obj_id[:-4] + nb = Date.get_nb_days_by_daterange(Date.get_date_from_timestamp(t), Date.get_today_date_str()) + if nb >= PDF_TRANSLATED_TTL: + print('deleted:', filename) + Language.delete_obj_translation(f'pdf::{obj_id}', target) + os.remove(os.path.join(PDF_TRANSLATED_DIR, filename)) + +def api_get_meta(obj_id, options=set(), flask_context=False): + obj = PDF(obj_id) + if not obj.exists(): + return {'error': 'PDF Not Found'}, 404 + return obj.get_meta(options=options, flask_context=flask_context), 200 + +def api_exists_translation_file(filename): + filename = os.path.basename(filename) + if not os.path.isfile(os.path.join(PDF_TRANSLATED_DIR, filename)): + return {'error': 'No Translation Found or Expired. 
Please Launch a new translation'}, 404 + return filename, 200 + +def api_create_translation_task(obj_id, source, target, force=False): + obj = PDF(obj_id) + if not obj.exists(): + return {'error': 'PDF Not Found'}, 404 + if not Language.exists_lang_iso_target_source(source, target): + return {'error': 'Invalid Language code'}, 400 + obj_gid = obj.get_global_id() + if Language.exists_object_translation_language(obj_gid, target): + if force: + obj.delete_translated(target) + else: + return {'error': 'Already Translated'}, 400 + task_uuid = Language.create_translation_task(obj_gid, source, target, force=force) + return task_uuid, 200 + +def api_get_translations_progress(obj_id): + obj = PDF(obj_id) + if not obj.exists(): + return {'error': 'PDF Not Found'}, 404 + return Language.api_get_object_translation_tasks_progress(Language.get_object_tasks_uuid(obj.get_global_id())) + +class PDFs(AbstractDaterangeObjects): + """ + PDF Objects + """ + def __init__(self): + super().__init__('pdf', PDF) + + def get_name(self): + return 'PDFS' + + def get_icon(self): + return {'fas': 'far', 'icon': 'file-pdf'} + + def get_link(self, flask_context=False): + if flask_context: + url = url_for('objects_pdf.objects_pdfs') + else: + url = f'{baseurl}/objects/pdfs' + return url + + def sanitize_id_to_search(self, name_to_search): + return name_to_search # TODO + + +# if __name__ == '__main__': +# pdf.translate() +# print(time.time() - t) diff --git a/bin/lib/objects/Screenshots.py b/bin/lib/objects/Screenshots.py index 0135bb251..2c967b969 100755 --- a/bin/lib/objects/Screenshots.py +++ b/bin/lib/objects/Screenshots.py @@ -104,6 +104,7 @@ def get_description_models(self): if key.startswith('desc:'): model = key[5:] models.append(model) + return models def add_description_model(self, model, description): self._set_field(f'desc:{model}', description) diff --git a/bin/lib/objects/abstract_object.py b/bin/lib/objects/abstract_object.py index 5c0c18e7b..f1a649a9b 100755 --- 
a/bin/lib/objects/abstract_object.py +++ b/bin/lib/objects/abstract_object.py @@ -89,40 +89,46 @@ def get_default_meta(self, tags=False, link=False, options=set()): dict_meta['uuid'] = str(uuid.uuid5(uuid.NAMESPACE_URL, self.get_id())) if 'custom' in options: dict_meta['custom'] = self.get_custom_meta() + if 'file-meta' in options: + dict_meta['file-meta'] = self.get_file_meta() + if 'investigations' in options: + dict_meta['investigations'] = self.get_investigations() + if 'svg_icon' in options: + dict_meta['svg_icon'] = self.get_svg_icon() return dict_meta def _get_obj_field(self, obj_type, subtype, obj_id, field): - if subtype is None: + if not subtype: return r_object.hget(f'meta:{obj_type}:{obj_id}', field) else: return r_object.hget(f'meta:{obj_type}:{subtype}:{obj_id}', field) def _exists_field(self, field): - if self.subtype is None: + if not self.subtype: return r_object.hexists(f'meta:{self.type}:{self.id}', field) else: return r_object.hexists(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', field) def _get_field(self, field): - if self.subtype is None: + if not self.subtype: return r_object.hget(f'meta:{self.type}:{self.id}', field) else: return r_object.hget(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', field) def _set_field(self, field, value): - if self.subtype is None: + if not self.subtype: return r_object.hset(f'meta:{self.type}:{self.id}', field, value) else: return r_object.hset(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', field, value) def _get_fields_keys(self): - if self.subtype is None: + if not self.subtype: return r_object.hkeys(f'meta:{self.type}:{self.id}') else: return r_object.hkeys(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}') def _delete_field(self, field): - if self.subtype is None: + if not self.subtype: return r_object.hdel(f'meta:{self.type}:{self.id}', field) else: return r_object.hdel(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', field) @@ -173,7 
+179,7 @@ def get_content(self): ## Custom Metas ## def get_custom_meta(self): - custom_metas = r_object.hget(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', 'custom') + custom_metas = self._get_field('custom') return custom_metas # full_custom_meta: dictionary of custom meta to save @@ -191,10 +197,30 @@ def set_custom_meta(self, full_custom_meta=None, *custom_metas): full_custom_meta = json.dumps(full_custom_meta) except Exception as e: raise Exception(f'Invalid JSON/Dictionary {e}') - r_object.hset(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', 'custom', full_custom_meta) + self._set_field('custom', full_custom_meta) def delete_custom_meta(self): - r_object.hdel(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', 'custom') + self._delete_field('custom') + + ## File-Meta ## + + def get_file_meta(self): + file_meta = self._get_field('file-meta') + if file_meta: + return json.loads(file_meta) + return None + + def set_file_meta(self, file_meta): + try: + file_meta = json.dumps(file_meta) + except Exception as e: + raise Exception(f'Invalid JSON/Dictionary {e}') + self._set_field('file-meta', file_meta) + + def delete_file_meta(self): + self._delete_field('file-meta') + + ## -File-Meta- ## ## Duplicates ## def get_duplicates(self): diff --git a/bin/lib/objects/ail_objects.py b/bin/lib/objects/ail_objects.py index 40166118f..ed6a7e735 100755 --- a/bin/lib/objects/ail_objects.py +++ b/bin/lib/objects/ail_objects.py @@ -19,6 +19,7 @@ from lib import chats_viewer +from lib.objects import Authors from lib.objects import BarCodes from lib.objects import Chats from lib.objects import ChatSubChannels @@ -40,6 +41,7 @@ from lib.objects import Mails from lib.objects import Messages from lib.objects import Ocrs +from lib.objects import PDFs from lib.objects import Pgps from lib.objects import QrCodes from lib.objects import Screenshots @@ -53,6 +55,7 @@ # config_loader = None # TODO INIT objs classes ???? 
OBJECTS_CLASS = { + 'author': {'obj': Authors.Author, 'objs': Authors.Authors}, 'barcode': {'obj': BarCodes.Barcode, 'objs': BarCodes.Barcodes}, 'chat': {'obj': Chats.Chat, 'objs': Chats.Chats}, 'chat-subchannel': {'obj': ChatSubChannels.ChatSubChannel, 'objs': None}, ###### ###### @@ -74,6 +77,7 @@ 'mail': {'obj': Mails.Mail, 'objs': Mails.Mails}, 'message': {'obj': Messages.Message, 'objs': None}, ############################################################# 'ocr': {'obj': Ocrs.Ocr, 'objs': Ocrs.Ocrs}, + 'pdf': {'obj': PDFs.PDF, 'objs': PDFs.PDFs}, 'pgp': {'obj': Pgps.Pgp, 'objs': Pgps.Pgps}, 'qrcode': {'obj': QrCodes.Qrcode, 'objs': QrCodes.Qrcodes}, 'screenshot': {'obj': Screenshots.Screenshot, 'objs': None}, #################################################################################################### @@ -316,6 +320,10 @@ def get_object_card_meta(obj_type, subtype, id, related_btc=False): meta['size'] = obj.get_size() meta["vt"] = obj.get_meta_vt() meta["vt"]["status"] = obj.is_vt_enabled() + if obj.get_type() == 'pdf': + meta['author'] = obj.get_author() + meta["file-names"] = obj.get_file_names() + meta["markdown_id"] = obj.get_markdown_id() # TAGS MODAL meta["add_tags_modal"] = Tag.get_modal_add_tags(obj.id, obj.get_type(), obj.get_subtype(r_str=True)) return meta diff --git a/bin/modules/CodeReader.py b/bin/modules/CodeReader.py index e07fe1ab2..cf83111ae 100755 --- a/bin/modules/CodeReader.py +++ b/bin/modules/CodeReader.py @@ -139,7 +139,7 @@ def compute(self, message): if obj.type == 'image': if self.obj.is_gif(): - self.logger.warning(f'Ignoring GIF: {self.obj.id}') + self.logger.info(f'Ignoring GIF: {self.obj.id}') return None # image - screenshot diff --git a/bin/modules/Global.py b/bin/modules/Global.py index 447b2e51e..2ef1973fd 100755 --- a/bin/modules/Global.py +++ b/bin/modules/Global.py @@ -37,7 +37,7 @@ # Import Project packages ################################## from modules.abstract_module import AbstractModule -from lib.ail_core 
import get_objects_tracked +from lib.ail_core import is_tracked_object from lib.ConfigLoader import ConfigLoader from lib.data_retention_engine import update_obj_date from lib.objects.Items import Item @@ -131,18 +131,23 @@ def compute(self, message, r_result=False): # TODO move OBJ ID sanitization to i else: self.logger.info(f"Empty Item: {message} not processed") - elif self.obj.type == 'message' or self.obj.type == 'ocr': + elif self.obj.type == 'message' or self.obj.type == 'ocr': # TODO TO Configure in ail_core self.add_message_to_queue(obj=self.obj, queue='Item') elif self.obj.type == 'image': self.add_message_to_queue(obj=self.obj, queue='Image', message=message) self.add_message_to_queue(obj=self.obj, queue='Images', message=message) elif self.obj.type == 'title': self.add_message_to_queue(obj=self.obj, queue='Titles', message=message) + elif self.obj.type == 'file-name': + pass + elif self.obj.type == 'pdf': + return None else: self.logger.critical(f"Empty obj: {self.obj} {message} not processed") + return None # Trackers - if self.obj.type in get_objects_tracked(): + if is_tracked_object(self.obj.type): self.add_message_to_queue(obj=self.obj, queue='Trackers') def check_filename(self, filename, new_file_content): diff --git a/bin/modules/Translation.py b/bin/modules/Translation.py new file mode 100755 index 000000000..b29748033 --- /dev/null +++ b/bin/modules/Translation.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys +import time + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from modules.abstract_module import AbstractModule +from lib.objects import PDFs +from lib import Language +# from lib.ConfigLoader import ConfigLoader + +class Translation(AbstractModule): + """ + Translation module for AIL framework + """ + + def __init__(self): + super(Translation, self).__init__() + + # Send module state to logs + 
self.logger.info(f'Module {self.module_name} initialized') + + self.refresh_time = 0 + + # ttl translated + def refresh(self): + PDFs.ttl_translated_pdfs() + self.refresh_time = int(time.time()) + + def compute(self, task_uuid): + print(f'Launch translation: {task_uuid}') + task = Language.TranslationTask(task_uuid) + obj_id = task.get_object().split(':', 2)[-1] + self.obj = PDFs.PDF(obj_id) + + task.start() + target = task.get_target() + self.obj.translate(task, task.get_source(), task.get_target()) + print(f'Translated PDF {target}: {obj_id}') + + def run(self): + """ + Run Module endless process + """ + # Endless loop processing messages from the input queue + while self.proceed: + # if self.refresh_time < 86400: + # self.refresh() + if Language.ping_libretranslate(): + task_uuid = Language.get_translation_task_to_launch() + if task_uuid: + # Module processing with the message from the queue + self.logger.debug(task_uuid) + # try: + self.compute(task_uuid) + # except Exception as err: + # self.logger.error(f'Error in module {self.module_name}: {err}') + # # Remove uuid ref + # self.remove_submit_uuid(uuid) + else: + # Wait before next process + time.sleep(self.pending_seconds) + else: + time.sleep(10) + + +if __name__ == '__main__': + module = Translation() + module.run() diff --git a/bin/modules/abstract_module.py b/bin/modules/abstract_module.py index 7478cb878..f8b235950 100644 --- a/bin/modules/abstract_module.py +++ b/bin/modules/abstract_module.py @@ -171,12 +171,12 @@ def run(self): # LOG ERROR trace = traceback.format_tb(err.__traceback__) trace = ''.join(trace) - self.logger.critical(f"Error in module {self.module_name}: {__name__} : {err}") if message: self.logger.critical(f"Module {self.module_name} input message: {message}") if self.obj: self.logger.critical(f"{self.module_name} Obj: {self.obj.get_global_id()}") self.logger.critical(trace) + self.logger.critical(f"Error in module {self.module_name}: {__name__} : {err}") if isinstance(err, 
ModuleQueueError): self.queue.error() diff --git a/bin/packages/Date.py b/bin/packages/Date.py index d49b6c375..5a05db46f 100644 --- a/bin/packages/Date.py +++ b/bin/packages/Date.py @@ -105,6 +105,9 @@ def get_current_utc_full_time(): timestamp = datetime.datetime.fromtimestamp(time.time()) return timestamp.strftime('%Y-%m-%d %H:%M:%S') +def get_date_from_timestamp(timestamp): + return datetime.datetime.fromtimestamp(timestamp).strftime('%Y%m%d') + def get_month_dates(date=None): if date: date = convert_date_str_to_datetime(date) diff --git a/bin/trackers/Tracker_Regex.py b/bin/trackers/Tracker_Regex.py index acb20a9ec..3c4f71b51 100755 --- a/bin/trackers/Tracker_Regex.py +++ b/bin/trackers/Tracker_Regex.py @@ -70,11 +70,11 @@ def compute(self, message): return None content = obj.get_content() - - for dict_regex in self.tracked_regexs[obj_type]: - matches = self.regex_finditer(dict_regex['regex'], obj_id, content) - if matches: - self.new_tracker_found(dict_regex['tracked'], 'regex', obj, matches) + if content: + for dict_regex in self.tracked_regexs[obj_type]: + matches = self.regex_finditer(dict_regex['regex'], obj_id, content) + if matches: + self.new_tracker_found(dict_regex['tracked'], 'regex', obj, matches) def extract_matches(self, re_matches, limit=500, lines=5): matches = [] diff --git a/bin/trackers/Tracker_Term.py b/bin/trackers/Tracker_Term.py index 3a3172fee..4323ad2bd 100755 --- a/bin/trackers/Tracker_Term.py +++ b/bin/trackers/Tracker_Term.py @@ -89,32 +89,32 @@ def compute(self, message): return None content = obj.get_content() + if content: + signal.alarm(self.max_execution_time) - signal.alarm(self.max_execution_time) + dict_words_freq = None + try: + dict_words_freq = Tracker.get_text_word_frequency(content) + except TimeoutException: + self.logger.warning(f"{self.obj.get_global_id()} processing timeout") + else: + signal.alarm(0) - dict_words_freq = None - try: - dict_words_freq = Tracker.get_text_word_frequency(content) - except 
TimeoutException: - self.logger.warning(f"{self.obj.get_global_id()} processing timeout") - else: - signal.alarm(0) + if dict_words_freq: - if dict_words_freq: - - # check solo words - for word in self.tracked_words[obj_type]: - if word in dict_words_freq: - self.new_tracker_found(word, 'word', obj) - - # check words set - for tracked_set in self.tracked_sets[obj_type]: - nb_uniq_word = 0 - for word in tracked_set['words']: + # check solo words + for word in self.tracked_words[obj_type]: if word in dict_words_freq: - nb_uniq_word += 1 - if nb_uniq_word >= tracked_set['nb']: - self.new_tracker_found(tracked_set['tracked'], 'set', obj) + self.new_tracker_found(word, 'word', obj) + + # check words set + for tracked_set in self.tracked_sets[obj_type]: + nb_uniq_word = 0 + for word in tracked_set['words']: + if word in dict_words_freq: + nb_uniq_word += 1 + if nb_uniq_word >= tracked_set['nb']: + self.new_tracker_found(tracked_set['tracked'], 'set', obj) def new_tracker_found(self, tracker_name, tracker_type, obj): # TODO FILTER obj_id = obj.get_id() diff --git a/configs/core.cfg.sample b/configs/core.cfg.sample index 644876309..3cfc65fad 100644 --- a/configs/core.cfg.sample +++ b/configs/core.cfg.sample @@ -5,7 +5,13 @@ pastes = PASTES hash = HASHS crawled = crawled har = CRAWLED_SCREENSHOT +files = FILES +# size in bytes +max_pdf_size = 100000000 +translated_pdf = temp/pdf +pdf_translation_ttl = 30 screenshot = CRAWLED_SCREENSHOT/screenshot +cookiejar_local_storage = CRAWLED_SCREENSHOT/cookiejar_local_storage images = IMAGES favicons = FAVICONS diff --git a/configs/modules.cfg b/configs/modules.cfg index 6aa6eb7dc..c936adf2f 100644 --- a/configs/modules.cfg +++ b/configs/modules.cfg @@ -193,6 +193,9 @@ publish = Tag_feed [Retro_Hunt_Module] publish = Tags +[Translation] +publish = Tags + ######## OTHER ######## [D4Client] diff --git a/installing_deps.sh b/installing_deps.sh index 6a705c522..741ade335 100755 --- a/installing_deps.sh +++ b/installing_deps.sh @@ -127,20 
+127,32 @@ DEFAULT_HOME=$(pwd) #### KVROCKS #### if [ -z "$SKIP_KVROCKS" ]; then echo "--- Building Kvrocks ---" - # If we are on debian, we can get the kvrocks deb package: - # download the right version from https://github.com/RocksLabs/kvrocks-fpm/releases - # then sudo dpkg -i kvrocks_2.11.1-1_amd64.deb (change the version number to yours) - - test ! -d kvrocks/ && git clone https://github.com/apache/incubator-kvrocks.git kvrocks - pushd kvrocks - # Build Kvrocks in portable mode - #export PORTABLE=1 - ./x.py build -j 4 - popd + + # Check if we're on a Debian-based system or in GitHub Actions + USE_DEB=false + if [ -f /etc/debian_version ] || [ ! -z "$GITHUB_ACTIONS" ]; then + # Additional check: verify dpkg is available + if command -v dpkg >/dev/null 2>&1; then + USE_DEB=true + fi + fi + + if [ "$USE_DEB" = true ]; then + echo "Debian-based system detected, installing from .deb package" + wget -O /tmp/kvr.deb https://github.com/RocksLabs/kvrocks-fpm/releases/download/2.14.0-1/kvrocks_2.14.0-1_amd64.deb + sudo dpkg -i /tmp/kvr.deb && rm /tmp/kvr.deb + else + echo "Non-Debian system detected, compiling from source" + test ! 
-d kvrocks/ && git clone https://github.com/apache/incubator-kvrocks.git kvrocks + pushd kvrocks + # Build Kvrocks in portable mode + #export PORTABLE=1 + ./x.py build -j 4 + popd + fi DEFAULT_KVROCKS_DATA=$DEFAULT_HOME/DATA_KVROCKS mkdir -p $DEFAULT_KVROCKS_DATA - sed -i "s|dir /tmp/kvrocks|dir ${DEFAULT_KVROCKS_DATA}|1" $DEFAULT_HOME/configs/6383.conf ##-- KVROCKS --## else @@ -207,4 +219,4 @@ if [ -z "$SKIP_DB_SETUP" ]; then echo "" else echo "--- Skipping database setup ---" -fi \ No newline at end of file +fi diff --git a/requirements.txt b/requirements.txt index e6d6730f0..2727f95d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,6 +35,10 @@ textblob>=0.15.3 html2text>=2020.1.16 beautifulsoup4>4.8.2 +# PDF +pymupdf +pymupdf4llm + # Crawler scrapy>2.0.0 scrapy-splash>=0.7.2 diff --git a/tests/test_api_crawler.py b/tests/test_api_crawler.py new file mode 100644 index 000000000..87072e1a6 --- /dev/null +++ b/tests/test_api_crawler.py @@ -0,0 +1,354 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Test suite for AIL Crawler API endpoints. + +This module tests the following endpoints: +- POST /api/v1/add/crawler/task - Add a new crawler task +- POST /api/v1/add/crawler/capture - Add a crawler capture + +All endpoints require authentication and proper user role. 
+""" + +import os +import sys +import unittest +import json +from unittest.mock import patch, MagicMock + +sys.path.append(os.environ['AIL_BIN']) +sys.path.append(os.environ['AIL_FLASK']) +################################## +# Import Project packages +################################## +from lib import ail_users +from lib import ail_logger +from lib.ConfigLoader import ConfigLoader + +# Import Flask app for testing - need to import Flask_server to initialize app +sys.path.append(os.environ['AIL_FLASK']) +import Flask_server # This initializes Flask_config.app +from Flask_config import app + +test_logger = ail_logger.get_test_config(create=True) + + +class TestApiCrawler(unittest.TestCase): + """ + Test suite for Crawler API endpoints. + + Tests cover authentication, authorization, input validation, + and error handling for crawler task and capture endpoints. + """ + + def setUp(self): + """ + Set up test client and test data. + + Initializes Flask test client, test token, and standard test data. + Skips tests if Flask app is not initialized. + """ + if app is None: + raise unittest.SkipTest("Flask app not initialized") + self.app = app + self.client = app.test_client() + + # Get a valid test token + try: + self.test_token = ail_users.get_user_token('admin@admin.test') + except Exception as e: + test_logger.warning(f'Could not get test token: {e}') + self.test_token = 'test_token_for_testing' + + # Standard test data + self.test_url = 'http://test-example.onion' + self.test_data = { + 'url': self.test_url + } + + def _make_authenticated_request(self, method, endpoint, data=None, token=None): + """ + Helper method to make authenticated API requests. 
+ + Args: + method: HTTP method (e.g., 'POST', 'GET') + endpoint: API endpoint path (e.g., '/api/v1/add/crawler/task') + data: Optional dictionary to send as JSON body + token: Optional auth token (defaults to self.test_token) + + Returns: + Flask test client response object + """ + headers = { + 'Authorization': token or self.test_token, + 'Content-Type': 'application/json' + } + if data: + return self.client.open( + endpoint, + method=method, + data=json.dumps(data), + headers=headers + ) + else: + return self.client.open( + endpoint, + method=method, + headers=headers + ) + + # ==================== POST /api/v1/add/crawler/task ==================== + + @patch('blueprints.api_rest.ail_api.authenticate_user') + @patch('blueprints.api_rest.ail_api.is_user_in_role') + @patch('blueprints.api_rest.ail_api.get_basic_user_meta') + @patch('blueprints.api_rest.crawlers.api_add_crawler_task') + def test_add_crawler_task_success(self, mock_add_task, mock_get_meta, mock_is_role, mock_auth): + """ + Test successful crawler task addition. + + Verifies that a valid request with proper authentication + successfully adds a crawler task and returns 200 with the URL. 
+ """ + # Mock authentication and authorization + mock_auth.return_value = ({'status': 'success'}, 200) + mock_is_role.return_value = True + mock_get_meta.return_value = ('test_org', 'test_user_id', 'user') + + # Mock crawler task addition returning None (success path) + mock_add_task.return_value = None + + response = self._make_authenticated_request('POST', '/api/v1/add/crawler/task', self.test_data) + + # Assertions + self.assertEqual(response.status_code, 200, "Should return 200 on success") + response_data = json.loads(response.data.decode()) + self.assertEqual(response_data.get('url'), self.test_url, "Response should contain the submitted URL") + self.assertIsInstance(response_data, dict, "Response should be a JSON object") + mock_add_task.assert_called_once() + + # Verify the call was made with correct arguments + call_args = mock_add_task.call_args + self.assertEqual(call_args[0][0], self.test_data, "Should pass request data to crawler") + self.assertEqual(call_args[0][1], 'test_org', "Should pass user org to crawler") + self.assertEqual(call_args.kwargs["user_id"], 'test_user_id', "Should pass user_id to crawler") + + @patch('blueprints.api_rest.ail_api.authenticate_user') + @patch('blueprints.api_rest.ail_api.is_user_in_role') + @patch('blueprints.api_rest.ail_api.get_basic_user_meta') + @patch('blueprints.api_rest.crawlers.api_add_crawler_task') + def test_add_crawler_task_error_from_crawler(self, mock_add_task, mock_get_meta, mock_is_role, mock_auth): + """ + Test crawler task addition when crawler returns an error. + + Verifies that errors from the crawler module are properly + propagated back to the client with appropriate status code. 
+ """ + # Mock authentication and authorization + mock_auth.return_value = ({'status': 'success'}, 200) + mock_is_role.return_value = True + mock_get_meta.return_value = ('test_org', 'test_user_id', 'user') + + # Mock crawler task addition returning an error + error_message = 'Crawler error: Invalid URL format' + mock_add_task.return_value = ({'status': 'error', 'reason': error_message}, 400) + + response = self._make_authenticated_request('POST', '/api/v1/add/crawler/task', self.test_data) + + # Assertions + self.assertEqual(response.status_code, 400, "Should return 400 when crawler returns error") + response_data = json.loads(response.data.decode()) + self.assertEqual(response_data.get('status'), 'error', "Response should indicate error status") + self.assertEqual(response_data.get('reason'), error_message, "Response should include error reason") + mock_add_task.assert_called_once() + + def test_add_crawler_task_missing_auth(self): + """ + Test crawler task addition without authentication header. + + Verifies that requests without Authorization header are rejected + with 401 Unauthorized status. + """ + response = self.client.post( + '/api/v1/add/crawler/task', + data=json.dumps(self.test_data), + content_type='application/json' + ) + + # Assertions + self.assertEqual(response.status_code, 401, "Should return 401 for missing authentication") + response_data = json.loads(response.data.decode()) + self.assertEqual(response_data.get('status'), 'error', "Response should indicate error status") + self.assertIn('Authentication', response_data.get('reason', ''), "Error message should mention authentication") + + def test_add_crawler_task_missing_url(self): + """ + Test crawler task addition with missing URL field. + + Verifies that requests without required 'url' field are handled appropriately. 
+ """ + with patch('blueprints.api_rest.ail_api.authenticate_user') as mock_auth, \ + patch('blueprints.api_rest.ail_api.is_user_in_role') as mock_is_role, \ + patch('blueprints.api_rest.ail_api.get_basic_user_meta') as mock_get_meta: + + # Mock authentication + mock_auth.return_value = ({'status': 'success'}, 200) + mock_is_role.return_value = True + mock_get_meta.return_value = ('test_org', 'test_user_id', 'user') + + # Request without URL + invalid_data = {} + response = self._make_authenticated_request('POST', '/api/v1/add/crawler/task', invalid_data) + + # Should either return 400 (validation error) or pass to crawler which will handle it + # The actual behavior depends on how the endpoint validates input + self.assertIn(response.status_code, [200, 400, 500], "Should return appropriate error status") + + def test_add_crawler_task_invalid_json(self): + """ + Test crawler task addition with invalid JSON. + + Verifies that malformed JSON requests are rejected. + """ + headers = { + 'Authorization': self.test_token, + 'Content-Type': 'application/json' + } + + response = self.client.post( + '/api/v1/add/crawler/task', + data='{"url": "test", invalid json}', + headers=headers + ) + + # Should return 400 Bad Request for invalid JSON + self.assertIn(response.status_code, [400, 415], "Should return error for invalid JSON") + + @patch('blueprints.api_rest.ail_api.authenticate_user') + def test_add_crawler_task_invalid_token(self, mock_auth): + """ + Test crawler task addition with invalid token. + + Verifies that requests with invalid authentication tokens + are rejected with 401 Unauthorized status. 
+ """ + # Mock authentication failure + mock_auth.return_value = ({'status': 'error', 'reason': 'Invalid token'}, 401) + + response = self._make_authenticated_request('POST', '/api/v1/add/crawler/task', self.test_data, token='invalid_token') + + # Assertions + self.assertEqual(response.status_code, 401, "Should return 401 for invalid token") + response_data = json.loads(response.data.decode()) + self.assertEqual(response_data.get('status'), 'error', "Response should indicate error status") + self.assertIn('token', response_data.get('reason', '').lower(), "Error message should mention token") + + @patch('blueprints.api_rest.ail_api.authenticate_user') + @patch('blueprints.api_rest.ail_api.is_user_in_role') + def test_add_crawler_task_wrong_role(self, mock_is_role, mock_auth): + """ + Test crawler task addition with insufficient user role. + + Verifies that authenticated users without required role ('user') + are rejected with 403 Forbidden status. + """ + # Mock authentication success but insufficient role + mock_auth.return_value = ({'status': 'success'}, 200) + mock_is_role.return_value = False # User not in 'user' role + + response = self._make_authenticated_request('POST', '/api/v1/add/crawler/task', self.test_data) + + # Assertions + self.assertEqual(response.status_code, 403, "Should return 403 for insufficient permissions") + response_data = json.loads(response.data.decode()) + self.assertEqual(response_data.get('status'), 'error', "Response should indicate error status") + self.assertIn('Forbidden', response_data.get('reason', ''), "Error message should mention access forbidden") + + # ==================== POST /api/v1/add/crawler/capture ==================== + + @patch('blueprints.api_rest.ail_api.authenticate_user') + @patch('blueprints.api_rest.ail_api.is_user_in_role') + @patch('blueprints.api_rest.ail_api.get_user_from_token') + @patch('blueprints.api_rest.crawlers.api_add_crawler_capture') + def test_add_crawler_capture_success(self, mock_add_capture, 
mock_get_user, mock_is_role, mock_auth): + """ + Test successful crawler capture addition. + + Verifies that a valid request with proper authentication + successfully adds a crawler capture and returns 200 with the URL. + """ + # Mock authentication and authorization + mock_auth.return_value = ({'status': 'success'}, 200) + mock_is_role.return_value = True + mock_get_user.return_value = 'test_user_id' + + # Mock crawler capture addition returning None (success path) + mock_add_capture.return_value = None + + response = self._make_authenticated_request('POST', '/api/v1/add/crawler/capture', self.test_data) + + # Assertions + self.assertEqual(response.status_code, 200, "Should return 200 on success") + response_data = json.loads(response.data.decode()) + self.assertEqual(response_data.get('url'), self.test_url, "Response should contain the submitted URL") + self.assertIsInstance(response_data, dict, "Response should be a JSON object") + mock_add_capture.assert_called_once() + + # Verify the call was made with correct arguments + call_args = mock_add_capture.call_args + self.assertEqual(call_args[0][0], self.test_data, "Should pass request data to crawler") + self.assertEqual(call_args[0][1], 'test_user_id', "Should pass user_id to crawler") + + @patch('blueprints.api_rest.ail_api.authenticate_user') + @patch('blueprints.api_rest.ail_api.is_user_in_role') + @patch('blueprints.api_rest.ail_api.get_user_from_token') + @patch('blueprints.api_rest.crawlers.api_add_crawler_capture') + def test_add_crawler_capture_error_from_crawler(self, mock_add_capture, mock_get_user, mock_is_role, mock_auth): + """ + Test crawler capture addition when crawler returns an error. + + Verifies that errors from the crawler module are properly + propagated back to the client with appropriate status code. 
+ """ + # Mock authentication and authorization + mock_auth.return_value = ({'status': 'success'}, 200) + mock_is_role.return_value = True + mock_get_user.return_value = 'test_user_id' + + # Mock crawler capture addition returning an error + error_message = 'Capture error: Failed to process screenshot' + mock_add_capture.return_value = ({'status': 'error', 'reason': error_message}, 400) + + response = self._make_authenticated_request('POST', '/api/v1/add/crawler/capture', self.test_data) + + # Assertions + self.assertEqual(response.status_code, 400, "Should return 400 when crawler returns error") + response_data = json.loads(response.data.decode()) + self.assertEqual(response_data.get('status'), 'error', "Response should indicate error status") + self.assertEqual(response_data.get('reason'), error_message, "Response should include error reason") + mock_add_capture.assert_called_once() + + def test_add_crawler_capture_missing_auth(self): + """ + Test crawler capture addition without authentication header. + + Verifies that requests without Authorization header are rejected + with 401 Unauthorized status. 
+ """ + response = self.client.post( + '/api/v1/add/crawler/capture', + data=json.dumps(self.test_data), + content_type='application/json' + ) + + # Assertions + self.assertEqual(response.status_code, 401, "Should return 401 for missing authentication") + response_data = json.loads(response.data.decode()) + self.assertEqual(response_data.get('status'), 'error', "Response should indicate error status") + self.assertIn('Authentication', response_data.get('reason', ''), "Error message should mention authentication") + + +if __name__ == "__main__": + unittest.main(exit=False) + diff --git a/update/v6.6/Update.py b/update/v6.6/Update.py new file mode 100755 index 000000000..51803450f --- /dev/null +++ b/update/v6.6/Update.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys + +sys.path.append(os.environ['AIL_HOME']) +################################## +# Import Project packages +################################## +from update.bin.ail_updater import AIL_Updater + +class Updater(AIL_Updater): + """default Updater.""" + + def __init__(self, version): + super(Updater, self).__init__(version) + + +if __name__ == '__main__': + updater = Updater('v6.6') + updater.run_update() diff --git a/update/v6.6/Update.sh b/update/v6.6/Update.sh new file mode 100755 index 000000000..1cc7307e9 --- /dev/null +++ b/update/v6.6/Update.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +[ -z "$AIL_HOME" ] && echo "Needs the env var AIL_HOME. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_REDIS" ] && echo "Needs the env var AIL_REDIS. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_BIN" ] && echo "Needs the env var AIL_BIN. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_FLASK" ] && echo "Needs the env var AIL_FLASK. Run the script from the virtual environment."
&& exit 1; + +export PATH=$AIL_HOME:$PATH +export PATH=$AIL_REDIS:$PATH +export PATH=$AIL_BIN:$PATH +export PATH=$AIL_FLASK:$PATH + +GREEN="\\033[1;32m" +DEFAULT="\\033[0;39m" + +echo -e $GREEN"Shutting down AIL ..."$DEFAULT +bash ${AIL_BIN}/LAUNCH.sh -ks +wait + +# SUBMODULES # +git submodule update + +bash ${AIL_BIN}/LAUNCH.sh -lrv +bash ${AIL_BIN}/LAUNCH.sh -lkv + +echo -e $GREEN"Installing python pymupdf..."$DEFAULT +pip install -U pymupdf + +echo -e $GREEN"Installing python pymupdf4llm..."$DEFAULT +pip install -U pymupdf4llm + +echo -e $GREEN"Updating python Lexilang..."$DEFAULT +pip uninstall -y lexilang +pip install -U git+https://github.com/ail-project/LexiLang + +echo -e $GREEN"Updating python pyail."$DEFAULT +pip install -U pyail + +echo -e $GREEN"Updating python pylacus."$DEFAULT +pip install -U pylacus + +echo -e $GREEN"Installing pyfaup-rs."$DEFAULT +pip install -U pyfaup-rs + +echo "" +echo -e $GREEN"Updating AIL VERSION ..."$DEFAULT +echo "" +python ${AIL_HOME}/update/v6.6/Update.py +wait +echo "" +echo "" + +exit 0 diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 17632b095..77aae11d9 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -66,6 +66,8 @@ from blueprints.objects_qrcode import objects_qrcode from blueprints.objects_favicon import objects_favicon from blueprints.objects_file_name import objects_file_name +from blueprints.objects_pdf import objects_pdf +from blueprints.objects_author import objects_author from blueprints.objects_ssh import objects_ssh from blueprints.objects_ip import objects_ip from blueprints.api_rest import api_rest @@ -130,7 +132,7 @@ def filter(self, record): Flask_config.app = Flask(__name__, static_url_path=baseUrl+'/static/') app = Flask_config.app -app.config['MAX_CONTENT_LENGTH'] = 900 * 1024 * 1024 +app.config['MAX_CONTENT_LENGTH'] = 2000 * 1024 * 1024 # ========= BLUEPRINT =========# app.register_blueprint(root, url_prefix=baseUrl) @@ -164,6 +166,8 @@ def filter(self, 
record): app.register_blueprint(objects_qrcode, url_prefix=baseUrl) app.register_blueprint(objects_favicon, url_prefix=baseUrl) app.register_blueprint(objects_file_name, url_prefix=baseUrl) +app.register_blueprint(objects_pdf, url_prefix=baseUrl) +app.register_blueprint(objects_author, url_prefix=baseUrl) app.register_blueprint(objects_ssh, url_prefix=baseUrl) app.register_blueprint(objects_ip, url_prefix=baseUrl) app.register_blueprint(search_b, url_prefix=baseUrl) @@ -272,7 +276,10 @@ def _handle_client_error(e): return Response(json.dumps({"status": "error", "reason": "Server Error"}) + '\n', mimetype='application/json'), 500 else: if current_user: - flask_logger.warning(f'User: {current_user.get_user_id()}') + try: + flask_logger.warning(f'User: {current_user.get_user_id()}') + except AttributeError as e: + flask_logger.warning(f'Anonymous User error (AnonymousUserMixin, user not logged)') return e @login_required diff --git a/var/www/blueprints/api_rest.py b/var/www/blueprints/api_rest.py index c2dd3611c..e28788614 100644 --- a/var/www/blueprints/api_rest.py +++ b/var/www/blueprints/api_rest.py @@ -152,6 +152,17 @@ def get_onions_up_month(date_year_month): res = Domains.api_get_onions_by_month(date_year_month) return Response(json.dumps(res[0]), mimetype='application/json'), res[1] +@api_rest.route("api/v1/lacus/cookiejar/import", methods=['POST']) +@token_required('user') +def lacus_cookiejar_import(): + data = request.get_json() + user_token = get_auth_from_header() + user_org, user_id, _ = ail_api.get_basic_user_meta(user_token) + + res = crawlers.api_import_lacus_cookiejar(user_org, user_id, data) + return Response(json.dumps(res[0]), mimetype='application/json'), res[1] + + # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # IMPORTERS # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # diff --git 
a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py index 1fb6650d6..623288f39 100644 --- a/var/www/blueprints/crawler_splash.py +++ b/var/www/blueprints/crawler_splash.py @@ -887,6 +887,20 @@ def crawler_cookiejar_cookie_delete(): cookiejar_uuid = res[0]['cookiejar_uuid'] return redirect(url_for('crawler_splash.crawler_cookiejar_show', uuid=cookiejar_uuid)) +@crawler_splash.route('/crawler/cookiejar/local_storage/delete', methods=['GET']) +@login_required +@login_user_no_api +def crawler_cookiejar_local_storage_delete(): + user_org = current_user.get_org() + user_id = current_user.get_user_id() + user_role = current_user.get_role() + cookiejar_uuid = request.args.get('uuid') + + res = crawlers.api_delete_cookiejar_local_storage(user_org, user_id, user_role, cookiejar_uuid) + if res[1] != 200: + return create_json_response(res[0], res[1]) + return redirect(url_for('crawler_splash.crawler_cookiejar_show', uuid=cookiejar_uuid)) + @crawler_splash.route('/crawler/cookiejar/delete', methods=['GET']) @login_required @@ -983,7 +997,7 @@ def crawler_cookiejar_cookie_add(): def crawler_cookiejar_cookie_manual_add_post(): user_org = current_user.get_org() user_id = current_user.get_user_id() - is_admin = current_user.is_admin() + user_role = current_user.get_role() cookiejar_uuid = request.form.get('cookiejar_uuid') name = request.form.get('name') value = request.form.get('value') @@ -1002,7 +1016,7 @@ def crawler_cookiejar_cookie_manual_add_post(): if secure: cookie_dict['secure'] = True - res = crawlers.api_create_cookie(user_org, user_id, is_admin, cookiejar_uuid, cookie_dict) + res = crawlers.api_create_cookie(user_org, user_id, user_role, cookiejar_uuid, cookie_dict) if res[1] != 200: return create_json_response(res[0], res[1]) @@ -1015,20 +1029,20 @@ def crawler_cookiejar_cookie_manual_add_post(): def crawler_cookiejar_cookie_json_add_post(): user_org = current_user.get_org() user_id = current_user.get_user_id() - is_admin = 
current_user.is_admin() + user_role = current_user.get_role() cookiejar_uuid = request.form.get('cookiejar_uuid') if 'file' in request.files: file = request.files['file'] json_cookies = file.read().decode() if json_cookies: - res = crawlers.api_import_cookies_from_json(user_org, user_id, is_admin, cookiejar_uuid, json_cookies) + res = crawlers.api_import_cookies_from_json(user_org, user_id, user_role, cookiejar_uuid, json_cookies) if res[1] != 200: return create_json_response(res[0], res[1]) return redirect(url_for('crawler_splash.crawler_cookiejar_show', cookiejar_uuid=cookiejar_uuid)) - return redirect(url_for('crawler_splash.crawler_cookiejar_cookie_add', cookiejar_uuid=cookiejar_uuid)) + return redirect(url_for('crawler_splash.crawler_cookiejar_cookie_add', uuid=cookiejar_uuid)) # --- Cookiejar ---# diff --git a/var/www/blueprints/hunters.py b/var/www/blueprints/hunters.py index 6753d1272..2dcb83e19 100644 --- a/var/www/blueprints/hunters.py +++ b/var/www/blueprints/hunters.py @@ -172,12 +172,21 @@ def show_tracker(): new_filter = request.form.get(f'{obj_type}_obj') if new_filter: filter_obj_types.append(obj_type) - if sorted(filter_obj_types) == list(Tracker.get_objects_tracked()): + filter_obj_types = ail_core.sanitize_tracked_objects(filter_obj_types) + if len(filter_obj_types) == ail_core.get_nb_objects_tracked(): filter_obj_types = [] - else: - tracker_uuid = request.args.get('uuid', None) - date_from = request.args.get('date_from') - date_to = request.args.get('date_to') + filter_obj_types = ','.join(filter_obj_types) + if filter_obj_types: + return redirect(url_for('hunters.show_tracker', uuid=tracker_uuid, date_from=date_from, date_to=date_to, filter=filter_obj_types)) + else: + return redirect(url_for('hunters.show_tracker', uuid=tracker_uuid, date_from=date_from, date_to=date_to)) + + tracker_uuid = request.args.get('uuid', None) + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + filter_obj_types = 
ail_core.sanitize_tracked_objects(request.args.get('filter', '').split(',')) + if len(filter_obj_types) == ail_core.get_nb_objects_tracked(): + filter_obj_types = [] res = Tracker.api_check_tracker_acl(tracker_uuid, user_org, user_id, user_role, 'view') if res: # invalid access @@ -207,7 +216,7 @@ def show_tracker(): if date_from: date_from, date_to = Date.sanitise_daterange(date_from, date_to) objs = tracker.get_objs_by_daterange(date_from, date_to, filter_obj_types) - meta['objs'] = ail_objects.get_objects_meta(objs, options={'last_full_date'}, flask_context=True) + meta['objs'] = ail_objects.get_objects_meta(objs, options={'last_full_date', 'pdf'}, flask_context=True) else: date_from = '' date_to = '' diff --git a/var/www/blueprints/objects_author.py b/var/www/blueprints/objects_author.py new file mode 100644 index 000000000..830103da3 --- /dev/null +++ b/var/www/blueprints/objects_author.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +''' + Blueprint Flask: author objects endpoints: authors daterange view, show author ... 
+''' + +import json +import os +import sys + +from io import BytesIO + +from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response, abort, send_file, send_from_directory +from flask_login import login_required + +# Import Role_Manager +from Role_Manager import login_admin, login_read_only, no_cache + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib import Language +from lib import Tag +from lib.objects import Authors + +from packages import Date + +# ============ BLUEPRINT ============ +objects_author = Blueprint('objects_author', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/author')) + +# ============ VARIABLES ============ +bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] + +def create_json_response(data, status_code): + return Response(json.dumps(data, indent=2, sort_keys=True), mimetype='application/json'), status_code + +# ============ FUNCTIONS ============ + +@objects_author.route("/objects/authors", methods=['GET']) +@login_required +@login_read_only +def objects_authors(): + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + show_objects = request.args.get('show_objects') + date = Date.sanitise_date_range(date_from, date_to) + date_from = date['date_from'] + date_to = date['date_to'] + + if show_objects: + dict_objects = Authors.Authors().api_get_meta_by_daterange(date_from, date_to) + else: + dict_objects = {} + + return render_template("AuthorDaterange.html", date_from=date_from, date_to=date_to, + dict_objects=dict_objects, show_objects=show_objects) + + +@objects_author.route("/objects/authors/post", methods=['POST']) +@login_required +@login_read_only +def objects_authors_post(): + date_from = request.form.get('date_from') + date_to = request.form.get('date_to') + show_objects = request.form.get('show_objects') + return 
redirect(url_for('objects_author.objects_authors', date_from=date_from, date_to=date_to, show_objects=show_objects)) + + +@objects_author.route("/objects/authors/range/json", methods=['GET']) +@login_required +@login_read_only +def objects_authors_range_json(): + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + date = Date.sanitise_date_range(date_from, date_to) + date_from = date['date_from'] + date_to = date['date_to'] + return jsonify(Authors.Authors().api_get_chart_nb_by_daterange(date_from, date_to)) + + +@objects_author.route("/objects/author", methods=['GET']) +@login_required +@login_read_only +def object_author(): + obj_id = request.args.get('id') + meta = Authors.api_get_author(obj_id) + if meta[1] != 200: + return create_json_response(meta[0], meta[1]) + else: + meta = meta[0] + return render_template("ShowAuthor.html", meta=meta, + bootstrap_label=bootstrap_label, + ail_tags=Tag.get_modal_add_tags(meta['id'], meta['type'], meta['subtype'])) + + +# ============= ROUTES ============== + diff --git a/var/www/blueprints/objects_item.py b/var/www/blueprints/objects_item.py index 3a9884073..534ca7174 100644 --- a/var/www/blueprints/objects_item.py +++ b/var/www/blueprints/objects_item.py @@ -70,6 +70,8 @@ def screenshot(filename): def showItem(): # # TODO: support post user_org = current_user.get_org() item_id = request.args.get('id') + match_uuid = request.args.get('match_uuid') + if not item_id or not item_basic.exist_item(item_id): abort(404) @@ -112,7 +114,7 @@ def showItem(): # # TODO: support post else: meta['investigations'] = [] - extracted = module_extractor.extract(current_user.get_user_id(), 'item', '', item.id, content=meta['content']) + extracted = module_extractor.extract(current_user.get_user_id(), 'item', '', item.id, content=meta['content'], match_uuid=match_uuid) extracted_matches = module_extractor.get_extracted_by_match(extracted) return render_template("show_item.html", bootstrap_label=bootstrap_label, 
diff --git a/var/www/blueprints/objects_pdf.py b/var/www/blueprints/objects_pdf.py new file mode 100644 index 000000000..2b9b12b7c --- /dev/null +++ b/var/www/blueprints/objects_pdf.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +""" + Blueprint PDF +""" + +# import io +import os +import sys +import json + +from flask import render_template, jsonify, request, Blueprint, redirect, url_for, Response, abort, send_file, send_from_directory +from flask_login import login_required + +# Import Role_Manager +from Role_Manager import login_admin, login_read_only, no_cache + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.objects import PDFs +from lib import Language +from lib import Tag +from packages import Date + +# ============ BLUEPRINT ============ +objects_pdf = Blueprint('objects_pdf', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/pdf')) + +# ============ VARIABLES ============ +bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] + +# ============ FUNCTIONS ============ +def create_json_response(data, status_code): + return Response(json.dumps(data, indent=2, sort_keys=True), mimetype='application/json'), status_code + +# ============= ROUTES ============== +@objects_pdf.route('/pdf/pdfa/') +@login_required +@login_read_only +@no_cache +def pdf_pdfa(pdf_id): + if not pdf_id: + abort(404) + if not 64 <= len(pdf_id): + abort(404) + pdf_id = pdf_id.replace('/', '') + pdf = PDFs.PDF(pdf_id) + return send_from_directory(PDFs.PDF_FOLDER, pdf.get_rel_path(), as_attachment=False, mimetype='pdf', download_name=f'{pdf_id}.pdf') + +@objects_pdf.route('/pdf/translate', methods=['POST']) +@login_required +@login_read_only +@no_cache +def pdf_translate(): + obj_id = request.form.get('id') + source = request.form.get('source') + target = request.form.get('target') + r = 
PDFs.api_create_translation_task(obj_id, source, target, force=True) + if r[1] != 200: + return create_json_response(r[0], r[1]) + else: + if request.referrer: + return redirect(request.referrer) + else: + return create_json_response({'error': 'No Referrer'}, 400) + # return redirect(url_for('objects_pdf.pdf_view', id=obj_id, task_uuid=r[0])) + # return send_file(io.BytesIO(r[0]), as_attachment=True, download_name=f'{obj_id}.pdf') + +@objects_pdf.route("/pdf/translated", methods=['GET']) +@login_required +@login_read_only +def pdf_translated(): + filename = request.args.get('filename') + r = PDFs.api_exists_translation_file(filename) + if r[1] != 200: + return create_json_response(r[0], r[1]) + return send_from_directory(PDFs.PDF_TRANSLATED_DIR, r[0], as_attachment=False, mimetype='pdf') + +@objects_pdf.route("/pdf/translate/task/delete", methods=['GET']) +@login_required +@login_read_only +def pdf_translate_task_delete(): + task_uuid = request.args.get('uuid') + r = Language.api_delete_translation_task(task_uuid) + if r[1] != 200: + return create_json_response(r[0], r[1]) + if request.referrer: + return redirect(request.referrer) + else: + return create_json_response({'error': 'No Referrer'}, 400) + +@objects_pdf.route("/pdf/translated/progress", methods=['GET']) +@login_required +@login_read_only +def pdf_translated_progress(): + obj_id = request.args.get('id') + r = PDFs.api_get_translations_progress(obj_id) + return create_json_response(r[0], r[1]) + +@objects_pdf.route("/pdf/view", methods=['GET']) +@login_required +@login_read_only +def pdf_view(): + obj_id = request.args.get('id') + r = PDFs.api_get_meta(obj_id, options={'author', 'file-meta', 'file-names', 'markdown_id', 'translated', 'svg_icon'}, flask_context=True) + if r[1] != 200: + return create_json_response(r[0], r[1]) + meta = r[0] + return render_template("ShowPDF.html", + ail_tags=Tag.get_modal_add_tags(meta['id'], object_type='item'), + translation_languages=Language.get_translation_languages(), 
+ meta=meta) + + +@objects_pdf.route("/objects/pdfs", methods=['GET']) +@login_required +@login_read_only +def objects_pdfs(): + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + show_objects = request.args.get('show_objects') + date = Date.sanitise_date_range(date_from, date_to) + date_from = date['date_from'] + date_to = date['date_to'] + + if show_objects: + dict_objects = PDFs.PDFs().api_get_meta_by_daterange(date_from, date_to) + else: + dict_objects = {} + + return render_template("PDFDaterange.html", date_from=date_from, date_to=date_to, + dict_objects=dict_objects, show_objects=show_objects) + + +@objects_pdf.route("/objects/pdfs/post", methods=['POST']) +@login_required +@login_read_only +def objects_pdfs_post(): + date_from = request.form.get('date_from') + date_to = request.form.get('date_to') + show_objects = request.form.get('show_objects') + return redirect(url_for('objects_pdf.objects_pdfs', date_from=date_from, date_to=date_to, show_objects=show_objects)) + + +@objects_pdf.route("/objects/pdfs/range/json", methods=['GET']) +@login_required +@login_read_only +def objects_pdfs_range_json(): + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + date = Date.sanitise_date_range(date_from, date_to) + date_from = date['date_from'] + date_to = date['date_to'] + return jsonify(PDFs.PDFs().api_get_chart_nb_by_daterange(date_from, date_to)) diff --git a/var/www/templates/chats_explorer/block_message.html b/var/www/templates/chats_explorer/block_message.html index 1d724c86e..a47b30c3a 100644 --- a/var/www/templates/chats_explorer/block_message.html +++ b/var/www/templates/chats_explorer/block_message.html @@ -169,7 +169,11 @@

{{ file_name }} {% for obj in message['files'][file_name] %}
- {{ loop.index }} + {% if obj['type'] == 'pdf' %} + {{ loop.index }} + {% else %} + {{ loop.index }} + {% endif %} {% if obj['tags'] %}
{% for tag in obj['tags'] %} @@ -180,9 +184,17 @@

{% endfor %}

{% else %} - + {% if message['files'][file_name][0]['type'] == 'pdf' %} + + {% else %} + + {% endif %}
- {{ file_name }} + {% if message['files'][file_name][0]['type'] == 'pdf' %} + {{ file_name }} + {% else %} + {{ file_name }} + {% endif %} {% if message['files'][file_name][0]['tags'] %}
{% for tag in message['files'][file_name][0]['tags'] %} diff --git a/var/www/templates/chats_explorer/block_translation_post.html b/var/www/templates/chats_explorer/block_translation_post.html new file mode 100644 index 000000000..567784a4a --- /dev/null +++ b/var/www/templates/chats_explorer/block_translation_post.html @@ -0,0 +1,81 @@ + \ No newline at end of file diff --git a/var/www/templates/correlation/show_correlation.html b/var/www/templates/correlation/show_correlation.html index b7482141f..2dd8dd77f 100644 --- a/var/www/templates/correlation/show_correlation.html +++ b/var/www/templates/correlation/show_correlation.html @@ -136,6 +136,10 @@ {% include 'objects/ocr/card_ocr.html' %} {% elif dict_object["object_type"] == "barcode" %} {% include 'objects/barcode/card_barcode.html' %} + {% elif dict_object["object_type"] == "pdf" %} + {% include 'objects/pdf/card_pdf.html' %} + {% elif dict_object["object_type"] == "author" %} + {% include 'objects/author/card_author.html' %} {% elif dict_object["object_type"] == "qrcode" %} {% include 'objects/qrcode/card_qrcode.html' %} {% elif dict_object["object_type"] == "item" %} @@ -248,15 +252,28 @@
Hidden objects:
    -
  • Select Correlation
  • +
  • + Select Correlation: +
    + + + + + + +
    +
  • -
  • +
  • -
    @@ -305,6 +322,10 @@
    Hidden objects:
    +
    + + +
    @@ -347,6 +368,10 @@
    Hidden objects:
    +
    + + +
    @@ -384,10 +409,21 @@
    Hidden objects:
    - +
    +
    +
    +
    + + +
    +
    +
    +
    + +
    +
    -
  • @@ -522,6 +558,19 @@

    Tags All Objects

    $('#ltagsgalaxies').val(tagsgalaxy); return true; } + +function set_level_zero() { + document.getElementById('level').value = 0; +} + +function select_object_type_selector() { + document.querySelectorAll('#correlation_type_selector input[type="checkbox"]').forEach(cb => cb.checked = true); +} + +function unselect_object_type_selector() { + document.querySelectorAll('#correlation_type_selector input[type="checkbox"]').forEach(cb => cb.checked = false); +} + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
    +
    + + {% include 'sidebars/sidebar_objects.html' %} + +
    + +
    +
    +
    + +{# {% include 'image/block_images_search.html' %}#} + +
    + + +
    + +
    +
    +
    Select a date range :
    + +
    +
    + +
    +
    +
    + +
    +
    + + +
    + + +
    +
    + +
    +
    +
    +
    +
    +
    + + {% if dict_objects %} + {% if date_from|string == date_to|string %} +

    {{ date_from }} Authors Name:

    + {% else %} +

    {{ date_from }} to {{ date_to }} Authors Name:

    + {% endif %} + + + + + + + + + + + + {% for obj_id in dict_objects %} + + + + + + + + {% endfor %} + +
    First SeenLast SeenTotalLast days
    + {{ dict_objects[obj_id]['content'] }} + {{ dict_objects[obj_id]['first_seen'] }}{{ dict_objects[obj_id]['last_seen'] }}{{ dict_objects[obj_id]['nb_seen'] }}
    + + + {% else %} + {% if show_objects %} + {% if date_from|string == date_to|string %} +

    {{ date_from }}, No Author

    + {% else %} +

    {{ date_from }} to {{ date_to }}, No Author

    + {% endif %} + {% endif %} + {% endif %} +
    + +
    +
    + + + + + + + + + + + + + + + + + diff --git a/var/www/templates/objects/author/ShowAuthor.html b/var/www/templates/objects/author/ShowAuthor.html new file mode 100644 index 000000000..1de49396c --- /dev/null +++ b/var/www/templates/objects/author/ShowAuthor.html @@ -0,0 +1,68 @@ + + + + + Author - AIL + + + + + + + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
    +
    + + {% include 'sidebars/sidebar_objects.html' %} + +
    + + {% with meta=meta, is_correlation=False %} + {% include 'objects/author/card_author.html' %} + {% endwith %} + +
    + +
    +
    + + + + + + diff --git a/var/www/templates/objects/author/card_author.html b/var/www/templates/objects/author/card_author.html new file mode 100644 index 000000000..59d156306 --- /dev/null +++ b/var/www/templates/objects/author/card_author.html @@ -0,0 +1,83 @@ + + + +{% with modal_add_tags=ail_tags %} + {% include 'modals/add_tags.html' %} +{% endwith %} + +{% include 'modals/edit_tag.html' %} + + + +
    +
    + {{ meta["id"] }} : +
    {{ meta["content"] }}
    +
      +
    • + + + + + + + + +
      + + + + {{ meta["svg_icon"]["icon"] }} + + + {{ meta['type'] }} + First Seen: {% if meta['first_seen'] %}{{ meta['first_seen'][0:4] }}-{{ meta['first_seen'][4:6] }}-{{ meta['first_seen'][6:8] }}{% endif %}Last Seen: {% if meta['last_seen'] %}{{ meta['last_seen'][0:4] }}-{{ meta['last_seen'][4:6] }}-{{ meta['last_seen'][6:8] }}{% endif %}
      +
    • +
    • + + +
    • + +
    • +
      + Tags: + {% for tag in meta['tags'] %} + + {% endfor %} + +
      +
    • +
    + + {% with obj_type='author', obj_id=meta['id'], obj_subtype='' %} + {% include 'modals/investigations_register_obj.html' %} + {% endwith %} + + + + {% if is_correlation %} + + + + {% else %} + + + + {% endif %} + + +
    +
    \ No newline at end of file diff --git a/var/www/templates/objects/item/show_item.html b/var/www/templates/objects/item/show_item.html index e08e7da45..de545f7c3 100644 --- a/var/www/templates/objects/item/show_item.html +++ b/var/www/templates/objects/item/show_item.html @@ -124,7 +124,12 @@
    Custom Meta
    {% if meta['father'] %}
    - Father: {{meta['father']}} + Father: + {% if meta['source'] == 'pdf' %} + {{meta['father'][5:]}} + {% else %} + {{meta['father']}} + {% endif %}
    {% endif %} diff --git a/var/www/templates/objects/pdf/PDFDaterange.html b/var/www/templates/objects/pdf/PDFDaterange.html new file mode 100644 index 000000000..297b70867 --- /dev/null +++ b/var/www/templates/objects/pdf/PDFDaterange.html @@ -0,0 +1,604 @@ + + + + + PDFs - AIL + + + + + + + + + + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
    +
    + + {% include 'sidebars/sidebar_objects.html' %} + +
    + +
    +
    +
    + +{# {% include 'image/block_images_search.html' %}#} + +
    + + +
    + +
    +
    +
    Select a date range :
    +
    +
    +
    + +
    +
    +
    + +
    +
    + + +
    + +
    +
    +
    + +
    +
    +
    +
    +
    +
    + + {% if dict_objects %} + {% if date_from|string == date_to|string %} +

    {{ date_from }} PDFs:

    + {% else %} +

    {{ date_from }} to {{ date_to }} PDFs:

    + {% endif %} + + + + + + + + + + + + {% for obj_id in dict_objects %} + + + + + + + + {% endfor %} + +
    First SeenLast SeenTotalLast days
    + {{ obj_id }} + {{ dict_objects[obj_id]['first_seen'] }}{{ dict_objects[obj_id]['last_seen'] }}{{ dict_objects[obj_id]['nb_seen'] }}
    + + + {% else %} + {% if show_objects %} + {% if date_from|string == date_to|string %} +

    {{ date_from }}, No PDF

    + {% else %} +

    {{ date_from }} to {{ date_to }}, No PDF

    + {% endif %} + {% endif %} + {% endif %} +
    + +
    +
    + + + + + + + + + + + + + + + + + diff --git a/var/www/templates/objects/pdf/ShowPDF.html b/var/www/templates/objects/pdf/ShowPDF.html new file mode 100644 index 000000000..280dd3dc4 --- /dev/null +++ b/var/www/templates/objects/pdf/ShowPDF.html @@ -0,0 +1,131 @@ + + + + + PDF - AIL + + + + + + + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
    +
    + + {% include 'sidebars/sidebar_objects.html' %} + +
    + + {% with meta=meta, is_correlation=False %} + {% include 'objects/pdf/card_pdf.html' %} + {% endwith %} + +
    + +
    +
    + + + + + + diff --git a/var/www/templates/objects/pdf/card_pdf.html b/var/www/templates/objects/pdf/card_pdf.html new file mode 100644 index 000000000..c5543ae36 --- /dev/null +++ b/var/www/templates/objects/pdf/card_pdf.html @@ -0,0 +1,126 @@ + + + +{% with modal_add_tags=ail_tags %} + {% include 'modals/add_tags.html' %} +{% endwith %} + +{% include 'modals/edit_tag.html' %} + +
    +
    +
    PDF original SHA-256: {{ meta["id"] }}
    + + {% if translation_languages %} +
    + {% if 'translated' in meta %} + {% set translated=meta['translated'] %} + {% else %} + {% set translated=none %} + {% endif %} + {% with translate_url=url_for('objects_pdf.pdf_translate'), obj_id=meta['id'], translation_name="Translate PDF", translated=translated %} + {% include 'chats_explorer/block_translation_post.html' %} + {% endwith %} +
    + {% endif %} +
      +
    • + + + + + + + + +
      + + + + {{ meta["svg_icon"]["icon"] }} + + + {{ meta['type'] }} + First Seen: {% if meta['first_seen'] %}{{ meta['first_seen'][0:4] }}-{{ meta['first_seen'][4:6] }}-{{ meta['first_seen'][6:8] }}{% endif %}Last Seen: {% if meta['last_seen'] %}{{ meta['last_seen'][0:4] }}-{{ meta['last_seen'][4:6] }}-{{ meta['last_seen'][6:8] }}{% endif %}
      +
    • +
    • +
      File Names:
      +
        + {% for file_name in meta['file-names'] %} +
      • {{ file_name }}
      • + {% endfor %} +
      +
    • + {% if 'author' in meta %} + {% if meta['author'] %} +
    • +
      Author:
      +
        +
      • {{ meta['author'] }}
      • +
      +
    • + {% endif %} + {% endif %} + {% if 'file-meta' in meta %} + {% if meta['file-meta'] %} +
    • +
      File Metadata:
      + + + {% for f_meta in meta['file-meta'] %} + + + + + {% endfor %} + +
      {{ f_meta }}:{{ meta['file-meta'][f_meta] }}
      +
    • + {% endif %} + {% endif %} +
    • +
      + Tags: + {% for tag in meta['tags'] %} + + {% endfor %} + +
      +
    • +
    + + {% with obj_type='pdf', obj_id=meta['id'], obj_subtype='' %} + {% include 'modals/investigations_register_obj.html' %} + {% endwith %} + + + + {% if is_correlation %} + + + + {% else %} + + + + {% endif %} + + +
    +
    \ No newline at end of file diff --git a/var/www/templates/sidebars/sidebar_objects.html b/var/www/templates/sidebars/sidebar_objects.html index b4624c6f1..afc13e7e8 100644 --- a/var/www/templates/sidebars/sidebar_objects.html +++ b/var/www/templates/sidebars/sidebar_objects.html @@ -94,6 +94,18 @@
    File Name
  • + +