diff --git a/cps/constants.py b/cps/constants.py
index ee907f3dd4..cb69aa614f 100644
--- a/cps/constants.py
+++ b/cps/constants.py
@@ -48,6 +48,12 @@
# an initial metadata manifest (prior to downloading videos or media) here:
XKLB_DB_FILE = "/library/calibre-web/xklb-metadata.db"
+# Maximum number of videos kept for download when a playlist or channel URL is added (the top entries by views per day)
+MAX_VIDEOS_PER_DOWNLOAD = 100
+
+# Maximum total size, in gigabytes, to download when adding a new video playlist (NOTE(review): not referenced in the visible part of this patch; confirm where it is enforced)
+MAX_GB_PER_DOWNLOAD = 10
+
if HOME_CONFIG:
home_dir = os.path.join(os.path.expanduser("~"), ".calibre-web")
if not os.path.exists(home_dir):
diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py
index 9abe094257..795c439cfa 100644
--- a/cps/tasks/metadata_extract.py
+++ b/cps/tasks/metadata_extract.py
@@ -5,7 +5,7 @@
from datetime import datetime
from flask_babel import lazy_gettext as N_, gettext as _
-from cps.constants import XKLB_DB_FILE
+from cps.constants import XKLB_DB_FILE, MAX_VIDEOS_PER_DOWNLOAD
from cps.services.worker import WorkerThread
from cps.tasks.download import TaskDownload
from cps.services.worker import CalibreTask, STAT_FINISH_SUCCESS, STAT_FAIL, STAT_STARTED, STAT_WAITING
@@ -29,7 +29,6 @@ def __init__(self, task_message, media_url, original_url, current_user_name):
self.columns = None
self.shelf_title = None
self.shelf_id = None
- self.playlist_id = None
self.main_message = None
def run(self, worker_thread):
@@ -48,13 +47,12 @@ def run(self, worker_thread):
subprocess_args = [lb_executable, "tubeadd", self.media_url]
log.info("Subprocess args: %s", subprocess_args)
- # Execute the download process using process_open
+ # Execute the metadata fetching process using process_open
try:
p = process_open(subprocess_args, newlines=True)
-
p.wait()
self_main_message = f"{self.media_url_link}"
- self.message = self_main_message
+ self.message = self_main_message + "..."
# Database operations
requested_urls = {}
@@ -63,11 +61,10 @@ def run(self, worker_thread):
cursor = conn.execute("PRAGMA table_info(media)")
self.columns = [column[1] for column in cursor.fetchall()]
if "error" in self.columns:
- rows = conn.execute("SELECT path, duration FROM media WHERE error IS NULL AND path LIKE 'http%'").fetchall()
+ rows = conn.execute("SELECT path, duration, time_uploaded, view_count, size FROM media WHERE error IS NULL AND path LIKE 'http%'").fetchall()
else:
- rows = conn.execute("SELECT path, duration FROM media WHERE path LIKE 'http%'").fetchall()
+ rows = conn.execute("SELECT path, duration, time_uploaded, view_count, size FROM media WHERE path LIKE 'http%'").fetchall()
- # Abort if there are no urls
if not rows:
log.info("No urls found in the database")
error = conn.execute("SELECT error, webpath FROM media WHERE error IS NOT NULL AND webpath = ?", (self.media_url,)).fetchone()
@@ -80,13 +77,27 @@ def run(self, worker_thread):
for row in rows:
path = row[0]
duration = row[1]
+ time_uploaded = row[2]
+ view_count = row[3]
+ size = row[4]
+
+ time_uploaded = datetime.utcfromtimestamp(time_uploaded)
+ now = datetime.now()
+ days_since_publish = (now - time_uploaded).days or 1
+ views_per_day = view_count / days_since_publish
+
is_playlist_video = False
if "playlists_id" in self.columns:
playlist_id = conn.execute("SELECT playlists_id FROM media WHERE path = ?", (path,)).fetchone()
if playlist_id:
- is_playlist_video = True
+ is_playlist_video = True
+
requested_urls[path] = {
"duration": duration,
+ "time_uploaded": time_uploaded,
+ "view_count": view_count,
+ "size": size,
+ "views_per_day": views_per_day,
"is_playlist_video": is_playlist_video
}
@@ -95,36 +106,40 @@ def run(self, worker_thread):
self.message = f"{self.media_url_link} failed: {db_error}"
# get the shelf title
- if any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]):
+ if "list=" in self.media_url or "@" in self.media_url or any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]):
+ url_part = self.media_url.split("/")[-1]
+ if "list=" in url_part:
+ url_part = url_part.split("list=")[-1]
+ try:
+ self.shelf_title = conn.execute("SELECT title FROM playlists WHERE extractor_playlist_id = ?", (url_part,)).fetchone()[0]
+ except sqlite3.Error as db_error:
+ log.error("An error occurred while trying to connect to the database: %s", db_error)
+ elif "@" in url_part:
+ self.shelf_title = url_part.split("@")[-1]
+ else:
+ self.shelf_title = "Unnamed Bookshelf"
+ response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title})
+ if response.status_code == 200:
+ self.shelf_id = response.json()["shelf_id"]
+ else:
+ log.error("An error occurred while trying to send the shelf title to %s", self.original_url)
+
+ # drop every URL that contains "shorts" (presumably YouTube Shorts; note this is a plain substring match on the URL)
+ requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if "shorts" not in url}
+
+ # sort the videos by views per day (highest first) and keep at most MAX_VIDEOS_PER_DOWNLOAD of them
+ requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:min(MAX_VIDEOS_PER_DOWNLOAD, len(requested_urls))])
+ log.debug("Videos sorted by views per day: \n%s", "\n".join([f"{index + 1}-{conn.execute('SELECT title FROM media WHERE path = ?', (requested_url,)).fetchone()[0]}:{requested_urls[requested_url]['views_per_day']}" for index, requested_url in enumerate(requested_urls.keys())]))
+ else:
try:
- self.playlist_id = self.media_url.split("/")[-1]
- if "list=" in self.playlist_id:
- self.playlist_id = self.playlist_id.split("list=")[-1]
- self.shelf_title = conn.execute("SELECT title FROM playlists WHERE extractor_playlist_id = ?", (self.playlist_id,)).fetchone()[0]
- elif "@" in self.playlist_id:
- self.shelf_title = self.playlist_id.split("@")[-1]
- else:
- self.shelf_title = "Unnamed Bookshelf"
+ extractor_id = conn.execute("SELECT extractor_id FROM media WHERE ? LIKE '%' || extractor_id || '%'", (self.media_url,)).fetchone()[0]
+ requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if extractor_id in url} # filter the requested_urls dict
except sqlite3.Error as db_error:
- if "no such table: playlists" in str(db_error):
- log.info("No playlists table found in the database")
- self.playlist_id = None
- else:
- log.error("An error occurred while trying to connect to the database: %s", db_error)
- self.message = f"{self.media_url_link} failed to download: {db_error}"
- self.progress = 0
- finally:
- log.info("Shelf title: %s", self.shelf_title)
+ log.error("An error occurred while trying to connect to the database: %s", db_error)
+ self.message = f"{self.media_url_link} failed to download: {db_error}"
conn.close()
- if self.shelf_title:
- response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title})
- if response.status_code == 200:
- self.shelf_id = response.json()["shelf_id"]
- else:
- log.error("An error occurred while trying to send the shelf title to %s", self.original_url)
-
num_requested_urls = len(requested_urls.keys())
total_duration = 0
@@ -135,14 +150,17 @@ def run(self, worker_thread):
)
WorkerThread.add(self.current_user_name, task_download)
- self.progress = (index + 1) / num_requested_urls
if requested_urls[requested_url]["duration"] is not None:
total_duration += requested_urls[requested_url]["duration"]
self.message = self_main_message + f"
Number of Videos: {index + 1}/{num_requested_urls}
Total Duration: {datetime.utcfromtimestamp(total_duration).strftime('%H:%M:%S')}"
+ self.progress = (index + 1) / num_requested_urls
+
+ self.end_time = datetime.now()
except Exception as e:
log.error("An error occurred during the subprocess execution: %s", e)
self.message = f"{self.media_url_link} failed: {e}"
+ self.end_time = datetime.now()
finally:
if p.returncode == 0 or self.progress == 1.0:
diff --git a/scripts/lb-wrapper b/scripts/lb-wrapper
index a779492ab0..1b883e65f4 100755
--- a/scripts/lb-wrapper
+++ b/scripts/lb-wrapper
@@ -47,7 +47,7 @@ fi
# fetching metadata. This will prevent hanging for playlist URLs or short URLs.
# "...to be able to list videos that are not downloaded yet"
if [[ $XKLB_INTERNAL_CMD == "tubeadd" ]]; then
- xklb_full_cmd="${XKLB_EXECUTABLE} tubeadd ${XKLB_DB_FILE} ${URL} --force ${VERBOSITY}"
+ xklb_full_cmd="${XKLB_EXECUTABLE} tubeadd ${XKLB_DB_FILE} ${URL} --force --extra ${VERBOSITY}"
elif [[ $XKLB_INTERNAL_CMD == "dl" ]]; then
xklb_full_cmd="${XKLB_EXECUTABLE} dl ${XKLB_DB_FILE} --prefix ${TMP_DOWNLOADS_DIR} --video --search ${URL} ${FORMAT_OPTIONS} --write-thumbnail ${VERBOSITY}"
else