From dfbce67cde703fd3853099ccacc0600e5637c307 Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Tue, 16 Apr 2024 14:09:14 -0400 Subject: [PATCH 1/3] Handle YouTube Shorts 1- Remove shorts duplicates from database 2- Ignore short videos when downloading playlists and channels --- cps/tasks/metadata_extract.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py index 4f6ec53e96..2bb201c922 100644 --- a/cps/tasks/metadata_extract.py +++ b/cps/tasks/metadata_extract.py @@ -56,6 +56,10 @@ def _execute_subprocess(self, subprocess_args): self.message = f"{self.media_url_link} failed: {e}" return None + def _remove_shorts_from_db(self, conn): + conn.execute("DELETE FROM media WHERE path LIKE '%shorts%'") + conn.commit() + def _fetch_requested_urls(self, conn): try: cursor = conn.execute("PRAGMA table_info(media)") @@ -100,6 +104,9 @@ def _send_shelf_title(self): except Exception as e: log.error("An error occurred during the shelf title sending: %s", e) + def _ignore_shorts(self, requested_urls): + requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if requested_urls[url]["duration"] > 60} + def _update_metadata(self, requested_urls): failed_urls = [] subprocess_args_list = [[os.getenv("LB_WRAPPER", "lb-wrapper"), "tubeadd", requested_url] for requested_url in requested_urls.keys()] @@ -117,7 +124,7 @@ def _update_metadata(self, requested_urls): self.message = f"{subprocess_args[2]} failed: {e}" failed_urls.append(subprocess_args[2]) - requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if "shorts" not in url and url not in failed_urls} + requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if url not in failed_urls} def _calculate_views_per_day(self, requested_urls, conn): now = datetime.now() @@ -160,6 +167,7 @@ def run(self, worker_thread): return with sqlite3.connect(XKLB_DB_FILE) as conn: + self._remove_shorts_from_db(conn) requested_urls = self._fetch_requested_urls(conn) if not requested_urls: return @@ -168,6 +176,7 @@ def run(self, worker_thread): self._get_shelf_title(conn) if any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]): self._send_shelf_title() + self._ignore_shorts(requested_urls) self._update_metadata(requested_urls) self._calculate_views_per_day(requested_urls, conn) requested_urls = self._sort_and_limit_requested_urls(requested_urls) From 78ce6c374607a491d2c031f295dd6bc89931c266 Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Tue, 16 Apr 2024 16:08:33 -0400 Subject: [PATCH 2/3] Revert _ignore_shorts --- cps/tasks/metadata_extract.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py index 2bb201c922..f5ef0dbfa5 100644 --- a/cps/tasks/metadata_extract.py +++ b/cps/tasks/metadata_extract.py @@ -103,9 +103,6 @@ def _send_shelf_title(self): log.error("Received unexpected status code %s while sending the shelf title to %s", response.status_code, self.original_url) except Exception as e: log.error("An error occurred during the shelf title sending: %s", e) - - def _ignore_shorts(self, requested_urls): - requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if requested_urls[url]["duration"] > 60} def _update_metadata(self, requested_urls): failed_urls = [] @@ -124,7 +121,7 @@ def _update_metadata(self, requested_urls): self.message = f"{subprocess_args[2]} failed: {e}" failed_urls.append(subprocess_args[2]) - requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if url not in failed_urls} + requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if "shorts" not in url and url not in failed_urls} def _calculate_views_per_day(self, requested_urls, conn): now = datetime.now() @@ -176,7 +173,6 @@ def run(self, worker_thread): self._get_shelf_title(conn) if any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]): self._send_shelf_title() - self._ignore_shorts(requested_urls) self._update_metadata(requested_urls) self._calculate_views_per_day(requested_urls, conn) requested_urls = self._sort_and_limit_requested_urls(requested_urls) From ad095bd5681b3b6090bc2b713864abb5baf1dc35 Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Tue, 16 Apr 2024 16:10:26 -0400 Subject: [PATCH 3/3] Remove blank space --- cps/tasks/metadata_extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py index f5ef0dbfa5..e0f0f6bc40 100644 --- a/cps/tasks/metadata_extract.py +++ b/cps/tasks/metadata_extract.py @@ -103,7 +103,7 @@ def _send_shelf_title(self): log.error("Received unexpected status code %s while sending the shelf title to %s", response.status_code, self.original_url) except Exception as e: log.error("An error occurred during the shelf title sending: %s", e) - + def _update_metadata(self, requested_urls): failed_urls = [] subprocess_args_list = [[os.getenv("LB_WRAPPER", "lb-wrapper"), "tubeadd", requested_url] for requested_url in requested_urls.keys()]