From 6db215bd5bf0bc1028bb998bd2b64fc1871cc734 Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Mon, 11 Mar 2024 14:48:09 +0000 Subject: [PATCH 1/9] Sort videos by views per day --- cps/tasks/metadata_extract.py | 47 ++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py index 9abe094257..952e9fc274 100644 --- a/cps/tasks/metadata_extract.py +++ b/cps/tasks/metadata_extract.py @@ -48,7 +48,7 @@ def run(self, worker_thread): subprocess_args = [lb_executable, "tubeadd", self.media_url] log.info("Subprocess args: %s", subprocess_args) - # Execute the download process using process_open + # Execute the metadata fetching process using process_open try: p = process_open(subprocess_args, newlines=True) @@ -115,16 +115,45 @@ def run(self, worker_thread): self.progress = 0 finally: log.info("Shelf title: %s", self.shelf_title) - + response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title}) + if response.status_code == 200: + self.shelf_id = response.json()["shelf_id"] + else: + log.error("An error occurred while trying to send the shelf title to %s", self.original_url) + + # update the metadata of every video in the shelf + for index, requested_url in enumerate(requested_urls.keys()): + try: + p = process_open([lb_executable, "tubeadd", requested_url], newlines=True) + p.wait() + except Exception as e: + log.error("An error occurred during updating the metadata of %s: %s", requested_url, e) + self.message = f"{requested_url} failed: {e}" + for index, requested_url in enumerate(requested_urls.keys()): + try: + view_count = conn.execute("SELECT view_count FROM media WHERE path = ?", (requested_url,)).fetchone()[0] + time_uploaded = conn.execute("SELECT time_uploaded FROM media WHERE path = ?", (requested_url,)).fetchone()[0] + time_uploaded = datetime.utcfromtimestamp(time_uploaded) + now = datetime.now() + # calculate views per day + days_since_publish = (now - time_uploaded).days + try: + requested_urls[requested_url]["views_per_day"] = view_count / days_since_publish + except ZeroDivisionError: + requested_urls[requested_url]["views_per_day"] = 0 + except Exception as e: + log.error("An error occurred during the subprocess execution: %s", e) + self.message = f"{requested_url} failed: {e}" + + # sort the videos by views per day and only keep the top ones + if len(requested_urls) > 10 and len(requested_urls) <= 50: + requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:10]) + elif len(requested_urls) > 50 and len(requested_urls) <= 100: + requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:50]) + elif len(requested_urls) > 100: + requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:100]) conn.close() - if self.shelf_title: - response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title}) - if response.status_code == 200: - self.shelf_id = response.json()["shelf_id"] - else: - log.error("An error occurred while trying to send the shelf title to %s", self.original_url) - num_requested_urls = len(requested_urls.keys()) total_duration = 0 From ae0aca8b6f1d23413e2a006c3df54f420b45e62e Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Tue, 12 Mar 2024 17:25:55 +0000 Subject: [PATCH 2/9] Add number of videos in constants --- cps/constants.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cps/constants.py b/cps/constants.py index ee907f3dd4..74c2d3c9ea 100644 --- a/cps/constants.py +++ b/cps/constants.py @@ -48,6 +48,9 @@ # an initial metadata manifest (prior to downloading videos or media) here: XKLB_DB_FILE = "/library/calibre-web/xklb-metadata.db" +# Number of videos to be downloaded (default: 100) based on views per day +NUMBER_OF_VIDEOS = 100 + if HOME_CONFIG: home_dir = os.path.join(os.path.expanduser("~"), ".calibre-web") if not os.path.exists(home_dir): From 2dfa249c921b8e22b375d7024aa44341c80a1740 Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Tue, 12 Mar 2024 17:46:51 +0000 Subject: [PATCH 3/9] Use NUMBER_OF_VIDEOS constant --- cps/tasks/metadata_extract.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py index 952e9fc274..839ab81cd3 100644 --- a/cps/tasks/metadata_extract.py +++ b/cps/tasks/metadata_extract.py @@ -5,7 +5,7 @@ from datetime import datetime from flask_babel import lazy_gettext as N_, gettext as _ -from cps.constants import XKLB_DB_FILE +from cps.constants import XKLB_DB_FILE, NUMBER_OF_VIDEOS from cps.services.worker import WorkerThread from cps.tasks.download import TaskDownload from cps.services.worker import CalibreTask, STAT_FINISH_SUCCESS, STAT_FAIL, STAT_STARTED, STAT_WAITING @@ -145,13 +145,9 @@ def run(self, worker_thread): log.error("An error occurred during the subprocess execution: %s", e) self.message = f"{requested_url} failed: {e}" - # sort the videos by views per day and only keep the top ones - if len(requested_urls) > 10 and len(requested_urls) <= 50: - requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:10]) - elif len(requested_urls) > 50 and len(requested_urls) <= 100: - requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:50]) - elif len(requested_urls) > 100: - requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:100]) + # sort the videos by views per day and get the top ones (up to the NUMBER_OF_VIDEOS constant or the length of the dictionary) + requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:min(NUMBER_OF_VIDEOS, len(requested_urls))]) + conn.close() num_requested_urls = len(requested_urls.keys()) From 0999663fedbc7670c0396d7111c8292a1b9d357b Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Tue, 12 Mar 2024 23:42:05 +0000 Subject: [PATCH 4/9] Use MAX_VIDEOS_PER_DOWNLOAD + progress polling fix --- cps/tasks/metadata_extract.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py index 839ab81cd3..b531efee69 100644 --- a/cps/tasks/metadata_extract.py +++ b/cps/tasks/metadata_extract.py @@ -5,7 +5,7 @@ from datetime import datetime from flask_babel import lazy_gettext as N_, gettext as _ -from cps.constants import XKLB_DB_FILE, NUMBER_OF_VIDEOS +from cps.constants import XKLB_DB_FILE, MAX_VIDEOS_PER_DOWNLOAD from cps.services.worker import WorkerThread from cps.tasks.download import TaskDownload from cps.services.worker import CalibreTask, STAT_FINISH_SUCCESS, STAT_FAIL, STAT_STARTED, STAT_WAITING @@ -51,7 +51,6 @@ def run(self, worker_thread): # Execute the metadata fetching process using process_open try: p = process_open(subprocess_args, newlines=True) - p.wait() self_main_message = f"{self.media_url_link}" self.message = self_main_message @@ -121,32 +120,30 @@ def run(self, worker_thread): else: log.error("An error occurred while trying to send the shelf title to %s", self.original_url) - # update the metadata of every video in the shelf + # update the metadata of the videos in the playlist for index, requested_url in enumerate(requested_urls.keys()): try: p = process_open([lb_executable, "tubeadd", requested_url], newlines=True) p.wait() + self.progress = (index + 1) / len(requested_urls) - 0.01 except Exception as e: log.error("An error occurred during updating the metadata of %s: %s", requested_url, e) self.message = f"{requested_url} failed: {e}" - for index, requested_url in enumerate(requested_urls.keys()): + for requested_url in requested_urls.keys(): try: view_count = conn.execute("SELECT view_count FROM media WHERE path = ?", (requested_url,)).fetchone()[0] time_uploaded = conn.execute("SELECT time_uploaded FROM media WHERE path = ?", (requested_url,)).fetchone()[0] time_uploaded = datetime.utcfromtimestamp(time_uploaded) now = datetime.now() # calculate views per day - days_since_publish = (now - time_uploaded).days - try: - requested_urls[requested_url]["views_per_day"] = view_count / days_since_publish - except ZeroDivisionError: - requested_urls[requested_url]["views_per_day"] = 0 + days_since_publish = (now - time_uploaded).days or 1 + requested_urls[requested_url]["views_per_day"] = view_count / days_since_publish except Exception as e: log.error("An error occurred during the subprocess execution: %s", e) self.message = f"{requested_url} failed: {e}" - # sort the videos by views per day and get the top ones (up to the NUMBER_OF_VIDEOS constant or the length of the dictionary) - requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:min(NUMBER_OF_VIDEOS, len(requested_urls))]) + # sort the videos by views per day and get the top ones (up to the maximum number of videos per download or the length of the dictionary) + requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:min(MAX_VIDEOS_PER_DOWNLOAD, len(requested_urls))]) conn.close() @@ -160,14 +157,17 @@ def run(self, worker_thread): ) WorkerThread.add(self.current_user_name, task_download) - self.progress = (index + 1) / num_requested_urls if requested_urls[requested_url]["duration"] is not None: total_duration += requested_urls[requested_url]["duration"] - self.message = self_main_message + f"

Number of Videos: {index + 1}/{num_requested_urls}
Total Duration: {datetime.utcfromtimestamp(total_duration).strftime('%H:%M:%S')}" + self.message = self_main_message + f"

Number of Videos: {index + 1}/{num_requested_urls}
Total Duration: {datetime.utcfromtimestamp(total_duration).strftime('%H:%M:%S')}" + + self.progress = 1.0 + self.end_time = datetime.now() except Exception as e: log.error("An error occurred during the subprocess execution: %s", e) self.message = f"{self.media_url_link} failed: {e}" + self.end_time = datetime.now() finally: if p.returncode == 0 or self.progress == 1.0: From 4ffde286a9abc0a1b3563a813353db0f1adb5977 Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Tue, 12 Mar 2024 23:43:48 +0000 Subject: [PATCH 5/9] Add MAX_VIDEOS_PER_DOWNLOAD + MAX_GB_PER_DOWNLOAD --- cps/constants.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cps/constants.py b/cps/constants.py index 74c2d3c9ea..cb69aa614f 100644 --- a/cps/constants.py +++ b/cps/constants.py @@ -48,8 +48,11 @@ # an initial metadata manifest (prior to downloading videos or media) here: XKLB_DB_FILE = "/library/calibre-web/xklb-metadata.db" -# Number of videos to be downloaded (default: 100) based on views per day -NUMBER_OF_VIDEOS = 100 +# Maximum number of videos to download when adding a new video playlist +MAX_VIDEOS_PER_DOWNLOAD = 100 + +# Maximum number of gigabytes to download when adding a new video playlist +MAX_GB_PER_DOWNLOAD = 10 if HOME_CONFIG: home_dir = os.path.join(os.path.expanduser("~"), ".calibre-web") From e8fb715c49f8994df7bf561e67183ae962dae04c Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Tue, 19 Mar 2024 07:44:17 -0400 Subject: [PATCH 6/9] Support downloading videos from channels --- cps/tasks/metadata_extract.py | 47 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py index b531efee69..fb3f207bc9 100644 --- a/cps/tasks/metadata_extract.py +++ b/cps/tasks/metadata_extract.py @@ -29,7 +29,6 @@ def __init__(self, task_message, media_url, original_url, current_user_name): self.columns = None self.shelf_title = None self.shelf_id = None - self.playlist_id = None self.main_message = None def run(self, worker_thread): @@ -53,7 +52,7 @@ def run(self, worker_thread): p = process_open(subprocess_args, newlines=True) p.wait() self_main_message = f"{self.media_url_link}" - self.message = self_main_message + self.message = self_main_message + "..." # Database operations requested_urls = {} @@ -94,32 +93,27 @@ def run(self, worker_thread): self.message = f"{self.media_url_link} failed: {db_error}" # get the shelf title - if any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]): - try: - self.playlist_id = self.media_url.split("/")[-1] - if "list=" in self.playlist_id: - self.playlist_id = self.playlist_id.split("list=")[-1] - self.shelf_title = conn.execute("SELECT title FROM playlists WHERE extractor_playlist_id = ?", (self.playlist_id,)).fetchone()[0] - elif "@" in self.playlist_id: - self.shelf_title = self.playlist_id.split("@")[-1] - else: - self.shelf_title = "Unnamed Bookshelf" - except sqlite3.Error as db_error: - if "no such table: playlists" in str(db_error): - log.info("No playlists table found in the database") - self.playlist_id = None - else: + if "list=" in self.media_url or "@" in self.media_url or any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]): + url_part = self.media_url.split("/")[-1] + if "list=" in url_part: + url_part = url_part.split("list=")[-1] + try: + self.shelf_title = conn.execute("SELECT title FROM playlists WHERE extractor_playlist_id = ?", (url_part,)).fetchone()[0] + except sqlite3.Error as db_error: log.error("An error occurred while trying to connect to the database: %s", db_error) - self.message = f"{self.media_url_link} failed to download: {db_error}" - self.progress = 0 - finally: - log.info("Shelf title: %s", self.shelf_title) - response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title}) - if response.status_code == 200: - self.shelf_id = response.json()["shelf_id"] - else: - log.error("An error occurred while trying to send the shelf title to %s", self.original_url) + elif "@" in url_part: + self.shelf_title = url_part.split("@")[-1] + else: + self.shelf_title = "Unnamed Bookshelf" + response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title}) + if response.status_code == 200: + self.shelf_id = response.json()["shelf_id"] + else: + log.error("An error occurred while trying to send the shelf title to %s", self.original_url) + # remove shorts from the requested_urls dict + requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if "shorts" not in url} + # update the metadata of the videos in the playlist for index, requested_url in enumerate(requested_urls.keys()): try: @@ -144,6 +138,7 @@ def run(self, worker_thread): # sort the videos by views per day and get the top ones (up to the maximum number of videos per download or the length of the dictionary) requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:min(MAX_VIDEOS_PER_DOWNLOAD, len(requested_urls))]) + log.debug("Videos sorted by views per day: \n%s", "\n".join([f"{index + 1}-{conn.execute('SELECT title FROM media WHERE path = ?', (requested_url,)).fetchone()[0]}:{requested_urls[requested_url]['views_per_day']}" for index, requested_url in enumerate(requested_urls.keys())])) conn.close() From 1f0601e7e989d1cd8b7bfc5591b255a5e2774e2c Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Tue, 19 Mar 2024 07:49:49 -0400 Subject: [PATCH 7/9] Select the exact video URL when a single video is requested Useful when other users are downloading videos or testing with lb-wrapper... --- cps/tasks/metadata_extract.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py index fb3f207bc9..d6d68a7284 100644 --- a/cps/tasks/metadata_extract.py +++ b/cps/tasks/metadata_extract.py @@ -139,6 +139,13 @@ def run(self, worker_thread): # sort the videos by views per day and get the top ones (up to the maximum number of videos per download or the length of the dictionary) requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:min(MAX_VIDEOS_PER_DOWNLOAD, len(requested_urls))]) log.debug("Videos sorted by views per day: \n%s", "\n".join([f"{index + 1}-{conn.execute('SELECT title FROM media WHERE path = ?', (requested_url,)).fetchone()[0]}:{requested_urls[requested_url]['views_per_day']}" for index, requested_url in enumerate(requested_urls.keys())])) + else: + try: + extractor_id = conn.execute("SELECT extractor_id FROM media WHERE ? LIKE '%' || extractor_id || '%'", (self.media_url,)).fetchone()[0] + requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if extractor_id in url} # filter the requested_urls dict + except sqlite3.Error as db_error: + log.error("An error occurred while trying to connect to the database: %s", db_error) + self.message = f"{self.media_url_link} failed to download: {db_error}" conn.close() From 4cef37a91382dd95c3847db620b9a4afe3843546 Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Wed, 20 Mar 2024 22:30:44 -0400 Subject: [PATCH 8/9] Get all metadata from a single 'lb tubeadd' run --- cps/tasks/metadata_extract.py | 47 ++++++++++++++--------------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py index d6d68a7284..795c439cfa 100644 --- a/cps/tasks/metadata_extract.py +++ b/cps/tasks/metadata_extract.py @@ -61,11 +61,10 @@ def run(self, worker_thread): cursor = conn.execute("PRAGMA table_info(media)") self.columns = [column[1] for column in cursor.fetchall()] if "error" in self.columns: - rows = conn.execute("SELECT path, duration FROM media WHERE error IS NULL AND path LIKE 'http%'").fetchall() + rows = conn.execute("SELECT path, duration, time_uploaded, view_count, size FROM media WHERE error IS NULL AND path LIKE 'http%'").fetchall() else: - rows = conn.execute("SELECT path, duration FROM media WHERE path LIKE 'http%'").fetchall() + rows = conn.execute("SELECT path, duration, time_uploaded, view_count, size FROM media WHERE path LIKE 'http%'").fetchall() - # Abort if there are no urls if not rows: log.info("No urls found in the database") error = conn.execute("SELECT error, webpath FROM media WHERE error IS NOT NULL AND webpath = ?", (self.media_url,)).fetchone() @@ -78,13 +77,27 @@ def run(self, worker_thread): for row in rows: path = row[0] duration = row[1] + time_uploaded = row[2] + view_count = row[3] + size = row[4] + + time_uploaded = datetime.utcfromtimestamp(time_uploaded) + now = datetime.now() + days_since_publish = (now - time_uploaded).days or 1 + views_per_day = view_count / days_since_publish + is_playlist_video = False if "playlists_id" in self.columns: playlist_id = conn.execute("SELECT playlists_id FROM media WHERE path = ?", (path,)).fetchone() if playlist_id: - is_playlist_video = True + is_playlist_video = True + requested_urls[path] = { "duration": duration, + "time_uploaded": time_uploaded, + "view_count": view_count, + "size": size, + "views_per_day": views_per_day, "is_playlist_video": is_playlist_video } @@ -113,28 +126,6 @@ def run(self, worker_thread): # remove shorts from the requested_urls dict requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if "shorts" not in url} - - # update the metadata of the videos in the playlist - for index, requested_url in enumerate(requested_urls.keys()): - try: - p = process_open([lb_executable, "tubeadd", requested_url], newlines=True) - p.wait() - self.progress = (index + 1) / len(requested_urls) - 0.01 - except Exception as e: - log.error("An error occurred during updating the metadata of %s: %s", requested_url, e) - self.message = f"{requested_url} failed: {e}" - for requested_url in requested_urls.keys(): - try: - view_count = conn.execute("SELECT view_count FROM media WHERE path = ?", (requested_url,)).fetchone()[0] - time_uploaded = conn.execute("SELECT time_uploaded FROM media WHERE path = ?", (requested_url,)).fetchone()[0] - time_uploaded = datetime.utcfromtimestamp(time_uploaded) - now = datetime.now() - # calculate views per day - days_since_publish = (now - time_uploaded).days or 1 - requested_urls[requested_url]["views_per_day"] = view_count / days_since_publish - except Exception as e: - log.error("An error occurred during the subprocess execution: %s", e) - self.message = f"{requested_url} failed: {e}" # sort the videos by views per day and get the top ones (up to the maximum number of videos per download or the length of the dictionary) requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:min(MAX_VIDEOS_PER_DOWNLOAD, len(requested_urls))]) @@ -161,9 +152,9 @@ def run(self, worker_thread): if requested_urls[requested_url]["duration"] is not None: total_duration += requested_urls[requested_url]["duration"] - self.message = self_main_message + f"

Number of Videos: {index + 1}/{num_requested_urls}
Total Duration: {datetime.utcfromtimestamp(total_duration).strftime('%H:%M:%S')}" + self.message = self_main_message + f"

Number of Videos: {index + 1}/{num_requested_urls}
Total Duration: {datetime.utcfromtimestamp(total_duration).strftime('%H:%M:%S')}" + self.progress = (index + 1) / num_requested_urls - self.progress = 1.0 self.end_time = datetime.now() except Exception as e: From 574a681c24b006a5b759d29c9f9f4a874e0cde92 Mon Sep 17 00:00:00 2001 From: Blondel MONDESIR Date: Wed, 20 Mar 2024 22:34:10 -0400 Subject: [PATCH 9/9] Align with xklb v2.5.018 --extra option make key metadata available without needing to download --- scripts/lb-wrapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lb-wrapper b/scripts/lb-wrapper index a779492ab0..1b883e65f4 100755 --- a/scripts/lb-wrapper +++ b/scripts/lb-wrapper @@ -47,7 +47,7 @@ fi # fetching metadata. This will prevent hanging for playlist URLs or short URLs. # "...to be able to list videos that are not downloaded yet" if [[ $XKLB_INTERNAL_CMD == "tubeadd" ]]; then - xklb_full_cmd="${XKLB_EXECUTABLE} tubeadd ${XKLB_DB_FILE} ${URL} --force ${VERBOSITY}" + xklb_full_cmd="${XKLB_EXECUTABLE} tubeadd ${XKLB_DB_FILE} ${URL} --force --extra ${VERBOSITY}" elif [[ $XKLB_INTERNAL_CMD == "dl" ]]; then xklb_full_cmd="${XKLB_EXECUTABLE} dl ${XKLB_DB_FILE} --prefix ${TMP_DOWNLOADS_DIR} --video --search ${URL} ${FORMAT_OPTIONS} --write-thumbnail ${VERBOSITY}" else