Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import asyncio
import json
import os
Comment thread
Ovler-Young marked this conversation as resolved.

import httpx
import pandas as pd
import streamlit as st

from search import fetch_all_user_details, format_register_time, search_users_api
from takeout import (
API_BASE,
download_notes_data,
gen_markdown,
post_process,
write_user_bak_meta,
)

st.title("画吧作品备份工具")

# ── Search ────────────────────────────────────────────
st.header("搜索用户")

keyword = st.text_input("用户名关键词", key="keyword")
if st.button("搜索") and keyword:

    async def _search():
        """Run the keyword search and the follow-up detail fetch on one shared client."""
        async with httpx.AsyncClient(timeout=60) as client:
            hits = await search_users_api(client, keyword)
            return await fetch_all_user_details(client, hits)

    # asyncio.run drives the coroutine to completion while the spinner is shown.
    with st.spinner("搜索中..."):
        users = asyncio.run(_search())

    # Exact matches come first, then the rest; each group is ordered by
    # note count, highest first.
    exact, other = [], []
    for user in users:
        (exact if user.is_exact_match else other).append(user)
    for group in (exact, other):
        group.sort(key=lambda user: user.notes_count, reverse=True)

    # Keep the results in session state so the table below can be drawn
    # whenever the key is present, not only right after a search.
    st.session_state["search_results"] = exact + other

if "search_results" in st.session_state:
    users = st.session_state["search_results"]
    rows = []
    for user in users:
        rows.append({
            "精确匹配": "⭐" if user.is_exact_match else "",
            "用户名": user.authorname,
            "作品数": user.notes_count,
            "注册时间": format_register_time(user.register_time),
            "JID": user.jid,
        })
    df = pd.DataFrame(rows)
    st.dataframe(df, width='stretch')
Comment thread
Ovler-Young marked this conversation as resolved.

# ── 导出 ──────────────────────────────────────────────
st.header("导出备份")

jid = st.text_input("JID", value=st.session_state.get("export_jid", ""), key="export_jid_input")
from_local = st.checkbox("从本地 notes.json 加载(跳过 API 拉取)")

if st.button("开始导出") and jid:
async def _export():
async with httpx.AsyncClient(timeout=300) as client:
if from_local:
usr_dir = jid.split("@")[0]
with open(f"user_backups/{usr_dir}/notes.json", "r") as f:

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.

Copilot Autofix

AI 2 months ago

In general, to fix uncontrolled path usage you should (1) define a safe base directory, (2) derive the user-specified component in a restricted/normalized manner, and (3) construct the final path using os.path.join and validate that the normalized result is still inside the base directory. Optionally, further restrict the user-derived component (e.g., only allow certain characters).

For this specific code, the best minimal fix is:

  • Define a constant base directory for all backups, e.g. BASE_BACKUP_DIR = "user_backups".
  • Sanitize usr_dir so it cannot contain path separators or traversal like "..". A simple, non-breaking approach is to strip it down to a whitelist of safe characters (letters, digits, underscore, dash), and fall back to an error if the result is empty.
  • Build the path using os.path.join(BASE_BACKUP_DIR, safe_usr_dir, "notes.json").
  • Normalize the path with os.path.normpath and verify that it starts with the normalized base backup directory path (using os.path.abspath or similar), raising an exception or showing an error if not.

All of this can be done directly in app.py, just above the open(...) call. We already import os, so no new imports are needed. We should keep the behavior of using the part before "@" for usr_dir but ensure it’s sanitized before use and checked against the base directory. For minimal impact and clarity inside this snippet, we can implement a small helper function (within the shown region) to construct and validate the backup path, then use it in place of the f-string path at line 59.

Suggested changeset 1
app.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/app.py b/app.py
--- a/app.py
+++ b/app.py
@@ -52,11 +52,24 @@
 from_local = st.checkbox("从本地 notes.json 加载(跳过 API 拉取)")
 
 if st.button("开始导出") and jid:
+    def _get_safe_notes_path(jid_value: str) -> str:
+        # Use the part before "@" as user directory, as before
+        raw_usr_dir = jid_value.split("@")[0]
+        # Restrict to a safe subset of characters to avoid path traversal and separators
+        safe_usr_dir = "".join(ch for ch in raw_usr_dir if ch.isalnum() or ch in ("-", "_"))
+        if not safe_usr_dir:
+            raise ValueError("Invalid JID: user directory is empty or contains only unsafe characters")
+        base_dir = os.path.abspath("user_backups")
+        candidate = os.path.abspath(os.path.normpath(os.path.join(base_dir, safe_usr_dir, "notes.json")))
+        if not candidate.startswith(base_dir + os.sep):
+            raise ValueError("Invalid path derived from JID")
+        return candidate
+
     async def _export():
         async with httpx.AsyncClient(timeout=300) as client:
             if from_local:
-                usr_dir = jid.split("@")[0]
-                with open(f"user_backups/{usr_dir}/notes.json", "r") as f:
+                notes_path = _get_safe_notes_path(jid)
+                with open(notes_path, "r") as f:
                     notes = json.load(f)
             else:
                 r = await client.get(API_BASE + "notes", params={"jid": jid})
EOF
@@ -52,11 +52,24 @@
from_local = st.checkbox("从本地 notes.json 加载(跳过 API 拉取)")

if st.button("开始导出") and jid:
def _get_safe_notes_path(jid_value: str) -> str:
# Use the part before "@" as user directory, as before
raw_usr_dir = jid_value.split("@")[0]
# Restrict to a safe subset of characters to avoid path traversal and separators
safe_usr_dir = "".join(ch for ch in raw_usr_dir if ch.isalnum() or ch in ("-", "_"))
if not safe_usr_dir:
raise ValueError("Invalid JID: user directory is empty or contains only unsafe characters")
base_dir = os.path.abspath("user_backups")
candidate = os.path.abspath(os.path.normpath(os.path.join(base_dir, safe_usr_dir, "notes.json")))
if not candidate.startswith(base_dir + os.sep):
raise ValueError("Invalid path derived from JID")
return candidate

async def _export():
async with httpx.AsyncClient(timeout=300) as client:
if from_local:
usr_dir = jid.split("@")[0]
with open(f"user_backups/{usr_dir}/notes.json", "r") as f:
notes_path = _get_safe_notes_path(jid)
with open(notes_path, "r") as f:
notes = json.load(f)
else:
r = await client.get(API_BASE + "notes", params={"jid": jid})
Copilot is powered by AI and may make mistakes. Always verify output.
notes = json.load(f)
Comment on lines +58 to +60
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

jid (user input) is used to derive usr_dir and then interpolated into filesystem paths (e.g., user_backups/{usr_dir}/...). Without validation, a crafted jid containing path separators could write/read outside the intended backup directory. Please sanitize/validate usr_dir (e.g., allowlist characters like [A-Za-z0-9_-] and reject anything containing /, \\, or ..) before using it in paths.

Copilot uses AI. Check for mistakes.
else:
r = await client.get(API_BASE + "notes", params={"jid": jid})
notes = r.json()
write_user_bak_meta(jid, notes)
await download_notes_data(client, jid, notes)
gen_markdown(jid, notes)

with st.spinner("下载中(进度见终端)..."):
asyncio.run(_export())

with st.spinner("生成压缩包..."):
post_process(jid)

usr_dir = jid.split("@")[0]
st.success("导出完成!")
st.write(f"备份预览:https://huabar-takeout-preview.saveweb.org/{usr_dir}/notes.html")
st.write(f"备份:https://huabar-takeout-preview.saveweb.org/{usr_dir}.zip")
st.write("您好。请尽快下载压缩包,链接将于数周后失效。祝好。")
20 changes: 20 additions & 0 deletions compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,23 @@ services:
environment:
- MONGODB_URI=${MONGODB_URI}
restart: always

takeout_ui:
image: ghcr.io/astral-sh/uv:python3.13-alpine
container_name: huabar-takeout-ui
working_dir: /app
ports:
- "127.0.0.63:8501:8501"
volumes:
- .:/app
- uv-cache:/root/.cache/uv
environment:
- API_BASE=http://draws_index:8080/api/
command: sh -c "apk add --no-cache pandoc zip && uv run streamlit run app.py --server.address=0.0.0.0 --server.baseUrlPath=/ui"
depends_on:
Comment on lines +24 to +25
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Installing pandoc and zip via apk add in the container command means every restart requires network access and adds startup latency (and can fail if mirrors are unavailable). Consider baking these OS packages into a small custom image (Dockerfile) and running uv sync/uv run without mutating the container at runtime.

Copilot uses AI. Check for mistakes.
- draws_index
restart: always
mem_limit: 1G

volumes:
uv-cache:
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ requires-python = ">=3.13"
dependencies = [
"filetype>=1.2",
"httpx>=0.28.1",
"pandas>=2.3.3",
"pymongo>=4.15.4",
"streamlit>=1.55.0",
"tqdm>=4.67.1",
]
4 changes: 3 additions & 1 deletion search.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import asyncio
import os

import httpx
from datetime import datetime, timezone, timedelta
from typing import List
import csv

API_BASE = "http://127.0.0.63:8539/api/"
API_BASE = os.environ.get("API_BASE", "http://127.0.0.63:8539/api/")
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

API_BASE is now read from the environment, but downstream code concatenates endpoints via API_BASE + "search"/"notes", which will break if the env var omits the trailing /. Consider normalizing API_BASE (e.g., rstrip('/') + '/') or using urljoin when constructing request URLs.

Suggested change
API_BASE = os.environ.get("API_BASE", "http://127.0.0.63:8539/api/")
API_BASE = os.environ.get("API_BASE", "http://127.0.0.63:8539/api/").rstrip("/") + "/"

Copilot uses AI. Check for mistakes.


class UserInfo:
Expand Down
2 changes: 1 addition & 1 deletion takeout.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
get_urltype
)

API_BASE = "http://127.0.0.63:8539/api/"
API_BASE = os.environ.get("API_BASE", "http://127.0.0.63:8539/api/")
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since API_BASE is now configurable via the API_BASE environment variable, the code assumes the provided value ends with a trailing / (because calls do API_BASE + "notes", etc.). To avoid subtle misconfiguration bugs, normalize this value (e.g., API_BASE = API_BASE.rstrip('/') + '/') or build URLs via urllib.parse.urljoin.

Suggested change
API_BASE = os.environ.get("API_BASE", "http://127.0.0.63:8539/api/")
API_BASE = os.environ.get("API_BASE", "http://127.0.0.63:8539/api/")
API_BASE = API_BASE.rstrip("/") + "/"

Copilot uses AI. Check for mistakes.

def is_keyable(url: str)->bool:
if not url:
Expand All @@ -34,16 +34,16 @@

def find_existing_file(path: str) -> str | None:
"""查找 path 或 path.* 是否已存在,返回找到的路径或 None。"""
if os.path.exists(path):

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.
return path
matches = glob_mod.glob(path + ".*")
return matches[0] if matches else None

def write_user_bak_meta(jid: str, notes: list[dict]):
usr_dir = jid.split('@')[0]
os.makedirs(f'user_backups/{usr_dir}', exist_ok=True)

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.

with open(f'user_backups/{usr_dir}/notes.json', 'w') as f:

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.
f.write(json.dumps(notes, ensure_ascii=False, indent=2))

async def download_to_bak(sem:asyncio.Semaphore, client:httpx.AsyncClient, url, jid, key):
Expand All @@ -51,15 +51,15 @@
path = f'user_backups/{usr_dir}/notes_data/{key}'
if find_existing_file(path):
return
os.makedirs(f'user_backups/{usr_dir}/notes_data/', exist_ok=True)

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.
async with sem:
r = await client.get(url, follow_redirects=True)
r.raise_for_status()
with open(path, 'wb') as f:

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.
f.write(r.content)
ext = detect_image_ext(path)
if ext:
os.rename(path, path + ext)

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.

async def get_zipname(client:httpx.AsyncClient, key:str):
# qiniu-draw-20240127-072800.3565.zip
Expand Down Expand Up @@ -87,8 +87,8 @@
usr_dir = jid.split('@')[0]
subprocess.run(['pandoc', f'user_backups/{usr_dir}/notes.md', '--standalone', '--output', f'user_backups/{usr_dir}/notes.html'], check=True)
# rm ${jid}.zip
if os.path.exists(f'user_backups/{usr_dir}.zip'):

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.
os.unlink(f'user_backups/{usr_dir}.zip')

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.
# zip -r $jid $jid
subprocess.run(['zip', '-r', f'user_backups/{usr_dir}.zip', f'user_backups/{usr_dir}'], check=True)

Expand Down Expand Up @@ -121,8 +121,8 @@

def gen_markdown(jid, notes):
usr_dir = jid.split('@')[0]
os.makedirs(f'user_backups/{usr_dir}', exist_ok=True)

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.
with open(f'user_backups/{usr_dir}/notes.md', 'w') as f:

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a user-provided value.
f.write(f"""\
# {notes[0]['payload']['authorname']} 的画吧作品备份

Expand Down
Loading
Loading