diff --git a/cps/constants.py b/cps/constants.py index ee907f3dd4..cb69aa614f 100644 --- a/cps/constants.py +++ b/cps/constants.py @@ -48,6 +48,12 @@ # an initial metadata manifest (prior to downloading videos or media) here: XKLB_DB_FILE = "/library/calibre-web/xklb-metadata.db" +# Maximum number of videos to download when adding a new video playlist +MAX_VIDEOS_PER_DOWNLOAD = 100 + +# Maximum number of gigabytes to download when adding a new video playlist +MAX_GB_PER_DOWNLOAD = 10 + if HOME_CONFIG: home_dir = os.path.join(os.path.expanduser("~"), ".calibre-web") if not os.path.exists(home_dir): diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py index 9abe094257..795c439cfa 100644 --- a/cps/tasks/metadata_extract.py +++ b/cps/tasks/metadata_extract.py @@ -5,7 +5,7 @@ from datetime import datetime from flask_babel import lazy_gettext as N_, gettext as _ -from cps.constants import XKLB_DB_FILE +from cps.constants import XKLB_DB_FILE, MAX_VIDEOS_PER_DOWNLOAD from cps.services.worker import WorkerThread from cps.tasks.download import TaskDownload from cps.services.worker import CalibreTask, STAT_FINISH_SUCCESS, STAT_FAIL, STAT_STARTED, STAT_WAITING @@ -29,7 +29,6 @@ def __init__(self, task_message, media_url, original_url, current_user_name): self.columns = None self.shelf_title = None self.shelf_id = None - self.playlist_id = None self.main_message = None def run(self, worker_thread): @@ -48,13 +47,12 @@ def run(self, worker_thread): subprocess_args = [lb_executable, "tubeadd", self.media_url] log.info("Subprocess args: %s", subprocess_args) - # Execute the download process using process_open + # Execute the metadata fetching process using process_open try: p = process_open(subprocess_args, newlines=True) - p.wait() self_main_message = f"{self.media_url_link}" - self.message = self_main_message + self.message = self_main_message + "..." # Database operations requested_urls = {} @@ -63,11 +61,10 @@ def run(self, worker_thread): cursor = conn.execute("PRAGMA table_info(media)") self.columns = [column[1] for column in cursor.fetchall()] if "error" in self.columns: - rows = conn.execute("SELECT path, duration FROM media WHERE error IS NULL AND path LIKE 'http%'").fetchall() + rows = conn.execute("SELECT path, duration, time_uploaded, view_count, size FROM media WHERE error IS NULL AND path LIKE 'http%'").fetchall() else: - rows = conn.execute("SELECT path, duration FROM media WHERE path LIKE 'http%'").fetchall() + rows = conn.execute("SELECT path, duration, time_uploaded, view_count, size FROM media WHERE path LIKE 'http%'").fetchall() - # Abort if there are no urls if not rows: log.info("No urls found in the database") error = conn.execute("SELECT error, webpath FROM media WHERE error IS NOT NULL AND webpath = ?", (self.media_url,)).fetchone() @@ -80,13 +77,27 @@ def run(self, worker_thread): for row in rows: path = row[0] duration = row[1] + time_uploaded = row[2] + view_count = row[3] + size = row[4] + + time_uploaded = datetime.utcfromtimestamp(time_uploaded) + now = datetime.now() + days_since_publish = (now - time_uploaded).days or 1 + views_per_day = view_count / days_since_publish + is_playlist_video = False if "playlists_id" in self.columns: playlist_id = conn.execute("SELECT playlists_id FROM media WHERE path = ?", (path,)).fetchone() if playlist_id: - is_playlist_video = True + is_playlist_video = True + requested_urls[path] = { "duration": duration, + "time_uploaded": time_uploaded, + "view_count": view_count, + "size": size, + "views_per_day": views_per_day, "is_playlist_video": is_playlist_video } @@ -95,36 +106,40 @@ def run(self, worker_thread): self.message = f"{self.media_url_link} failed: {db_error}" # get the shelf title - if any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]): + if "list=" in self.media_url or "@" in self.media_url or any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]): + url_part = self.media_url.split("/")[-1] + if "list=" in url_part: + url_part = url_part.split("list=")[-1] + try: + self.shelf_title = conn.execute("SELECT title FROM playlists WHERE extractor_playlist_id = ?", (url_part,)).fetchone()[0] + except sqlite3.Error as db_error: + log.error("An error occurred while trying to connect to the database: %s", db_error) + elif "@" in url_part: + self.shelf_title = url_part.split("@")[-1] + else: + self.shelf_title = "Unnamed Bookshelf" + response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title}) + if response.status_code == 200: + self.shelf_id = response.json()["shelf_id"] + else: + log.error("An error occurred while trying to send the shelf title to %s", self.original_url) + + # remove shorts from the requested_urls dict + requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if "shorts" not in url} + + # sort the videos by views per day and get the top ones (up to the maximum number of videos per download or the length of the dictionary) + requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:min(MAX_VIDEOS_PER_DOWNLOAD, len(requested_urls))]) + log.debug("Videos sorted by views per day: \n%s", "\n".join([f"{index + 1}-{conn.execute('SELECT title FROM media WHERE path = ?', (requested_url,)).fetchone()[0]}:{requested_urls[requested_url]['views_per_day']}" for index, requested_url in enumerate(requested_urls.keys())])) + else: try: - self.playlist_id = self.media_url.split("/")[-1] - if "list=" in self.playlist_id: - self.playlist_id = self.playlist_id.split("list=")[-1] - self.shelf_title = conn.execute("SELECT title FROM playlists WHERE extractor_playlist_id = ?", (self.playlist_id,)).fetchone()[0] - elif "@" in self.playlist_id: - self.shelf_title = self.playlist_id.split("@")[-1] - else: - self.shelf_title = "Unnamed Bookshelf" + extractor_id = conn.execute("SELECT extractor_id FROM media WHERE ? LIKE '%' || extractor_id || '%'", (self.media_url,)).fetchone()[0] + requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if extractor_id in url} # filter the requested_urls dict except sqlite3.Error as db_error: - if "no such table: playlists" in str(db_error): - log.info("No playlists table found in the database") - self.playlist_id = None - else: - log.error("An error occurred while trying to connect to the database: %s", db_error) - self.message = f"{self.media_url_link} failed to download: {db_error}" - self.progress = 0 - finally: - log.info("Shelf title: %s", self.shelf_title) + log.error("An error occurred while trying to connect to the database: %s", db_error) + self.message = f"{self.media_url_link} failed to download: {db_error}" conn.close() - if self.shelf_title: - response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title}) - if response.status_code == 200: - self.shelf_id = response.json()["shelf_id"] - else: - log.error("An error occurred while trying to send the shelf title to %s", self.original_url) - num_requested_urls = len(requested_urls.keys()) total_duration = 0 @@ -135,14 +150,17 @@ def run(self, worker_thread): ) WorkerThread.add(self.current_user_name, task_download) - self.progress = (index + 1) / num_requested_urls if requested_urls[requested_url]["duration"] is not None: total_duration += requested_urls[requested_url]["duration"] self.message = self_main_message + f"

Number of Videos: {index + 1}/{num_requested_urls}
Total Duration: {datetime.utcfromtimestamp(total_duration).strftime('%H:%M:%S')}" + self.progress = (index + 1) / num_requested_urls + + self.end_time = datetime.now() except Exception as e: log.error("An error occurred during the subprocess execution: %s", e) self.message = f"{self.media_url_link} failed: {e}" + self.end_time = datetime.now() finally: if p.returncode == 0 or self.progress == 1.0: diff --git a/scripts/lb-wrapper b/scripts/lb-wrapper index a779492ab0..1b883e65f4 100755 --- a/scripts/lb-wrapper +++ b/scripts/lb-wrapper @@ -47,7 +47,7 @@ fi # fetching metadata. This will prevent hanging for playlist URLs or short URLs. # "...to be able to list videos that are not downloaded yet" if [[ $XKLB_INTERNAL_CMD == "tubeadd" ]]; then - xklb_full_cmd="${XKLB_EXECUTABLE} tubeadd ${XKLB_DB_FILE} ${URL} --force ${VERBOSITY}" + xklb_full_cmd="${XKLB_EXECUTABLE} tubeadd ${XKLB_DB_FILE} ${URL} --force --extra ${VERBOSITY}" elif [[ $XKLB_INTERNAL_CMD == "dl" ]]; then xklb_full_cmd="${XKLB_EXECUTABLE} dl ${XKLB_DB_FILE} --prefix ${TMP_DOWNLOADS_DIR} --video --search ${URL} ${FORMAT_OPTIONS} --write-thumbnail ${VERBOSITY}" else