#!/usr/bin/env python3

import html
import re
import sys
import urllib.error
import urllib.request
from datetime import datetime, timedelta, timezone
from html.parser import HTMLParser
10+
11+
class TextExtractor(HTMLParser):
    """HTMLParser subclass that flattens markup into plain text.

    Block-level tags become newlines, ``<li>`` items become ``- `` bullets,
    table cells are separated by spaces, and the contents of
    ``<script>``/``<style>`` elements are dropped entirely.
    """

    # Tags whose textual content must never reach the output.
    _SKIPPED = frozenset({"script", "style"})
    # Tags that should start (and end) on their own line.
    _BLOCK = frozenset({
        "p", "div", "tr", "section", "article", "header",
        "ul", "ol", "h1", "h2", "h3", "h4", "h5", "h6",
    })

    def __init__(self):
        super().__init__()
        self.parts = []      # accumulated text fragments, joined by text()
        self.skip_depth = 0  # nesting level inside <script>/<style>

    def handle_starttag(self, tag, attrs):
        if tag in self._SKIPPED:
            self.skip_depth += 1
            return

        if self.skip_depth > 0:
            return

        if tag == "br":
            self.parts.append("\n")
        elif tag == "li":
            self.parts.append("\n- ")
        elif tag in self._BLOCK:
            self.parts.append("\n")
        elif tag in {"td", "th"}:
            # Separate table cells so columns don't run together.
            self.parts.append(" ")

    def handle_endtag(self, tag):
        if tag in self._SKIPPED:
            # max() guards against stray closing tags in malformed HTML.
            self.skip_depth = max(0, self.skip_depth - 1)
            return

        if self.skip_depth > 0:
            return

        if tag in self._BLOCK:
            self.parts.append("\n")

    def handle_data(self, data):
        if self.skip_depth == 0:
            self.parts.append(data)

    def text(self):
        """Return the accumulated plain text with HTML entities unescaped."""
        return html.unescape("".join(self.parts))
52+
53+
def fetch_page(base_url, headers, page_number):
    """Download one page of the changelog and return it as decoded text.

    Args:
        base_url: Changelog URL without the paging query string.
        headers: HTTP headers to send (browser-like User-Agent etc.).
        page_number: 1-based page index, sent as the ``?p=`` parameter.

    Returns:
        The response body decoded as UTF-8, with undecodable bytes replaced.

    Raises:
        urllib.error.URLError / OSError / TimeoutError on network failure.
    """
    # The URL must contain no stray whitespace; spaces here would make the
    # request invalid.
    request = urllib.request.Request(
        f"{base_url}?p={page_number}",
        headers=headers,
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        return response.read().decode("utf-8", "replace")
61+
62+
def html_to_text(fragment):
    """Convert an HTML fragment to readable plain text.

    Strips scripts/styles, turns block tags into newlines, removes the
    BBCode markers Steam leaves inside changelog bodies, and collapses
    runs of blank lines.
    """
    parser = TextExtractor()
    parser.feed(fragment)
    parser.close()
    text = parser.text()
    # Normalize all newline conventions to "\n" before regex cleanup.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Drop simple BBCode formatting tags ([b], [i], [quote], ...).
    text = re.sub(r"(?i)\[/?(?:b|i|u|s|quote|code|list|olist|h\d)\]", "", text)
    # Rewrite [url=target]label[/url] as "label (target)".
    text = re.sub(r"(?is)\[url=([^\]]+)\](.*?)\[/url\]", r"\2 (\1)", text)
    # Images cannot be rendered in text output; remove them entirely.
    text = re.sub(r"(?is)\[img\].*?\[/img\]", "", text)
    # Trim trailing whitespace on each line, then collapse blank-line runs.
    text = re.sub(r"[ \t]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
75+
76+
def parse_timestamp(date_text, raw_html, raw_search_start):
    """Resolve a changelog header date to a Unix timestamp.

    Prefers the exact epoch value Steam embeds in a ``data-*`` attribute
    near the human-readable date; falls back to parsing the text itself.

    Args:
        date_text: Human-readable date from the changelog header, e.g.
            "21 Mar @ 1:19pm" (Steam omits the year for recent entries).
        raw_html: The raw page HTML the date was extracted from.
        raw_search_start: Offset in raw_html at which to start searching,
            so repeated dates on one page resolve in document order.

    Returns:
        A ``(timestamp, next_search_start)`` tuple; pass
        ``next_search_start`` back in for the next entry on the same page.
    """
    raw_index = raw_html.find(date_text, raw_search_start)
    if raw_index != -1:
        # Steam embeds the exact epoch in a data attribute close to the
        # visible date; it needs no year guessing, so use it when present.
        snippet = raw_html[max(0, raw_index - 600):raw_index + 600]
        attr_match = re.search(r'data-(?:timestamp|rtime(?:_updated)?|time_updated)="(\d+)"', snippet)
        if attr_match:
            return int(attr_match.group(1)), raw_index + len(date_text)

    next_search_start = raw_index + len(date_text) if raw_index != -1 else raw_search_start
    now = datetime.now(timezone.utc)

    # The yearless format ("21 Mar @ 1:19pm") needs a year supplied. Try the
    # current year first; fall back to the previous year when the result
    # lands in the future (e.g. a December entry scraped in January) or when
    # parsing fails outright (29 Feb in a non-leap year).
    parsed = None
    for year in (now.year, now.year - 1):
        try:
            candidate = datetime.strptime(f"{date_text} {year}", "%d %b @ %I:%M%p %Y")
        except ValueError:
            continue
        parsed = candidate.replace(tzinfo=timezone.utc)
        # One day of slack absorbs clock skew between us and Steam.
        if parsed <= now + timedelta(days=1):
            break

    if parsed is None:
        # Older entries carry an explicit year in a different format.
        try:
            parsed = datetime.strptime(date_text, "%d %b, %Y @ %I:%M%p").replace(tzinfo=timezone.utc)
        except ValueError:
            # Unrecognized format: fall back to "now" so the caller's
            # previous_ts comparison treats the entry as new.
            parsed = now

    return int(parsed.timestamp()), next_search_start
105+
106+
def main():
    """CLI entry point: print Workshop changelog entries newer than a cutoff.

    Usage: ``steam_changelog_scraper.py <workshop_id> <previous_ts>``

    Walks the changelog pages (newest first) and prints every entry whose
    timestamp is strictly greater than ``previous_ts``, formatted as
    Markdown ``###`` sections. Prints a placeholder message when nothing
    new is found or the scrape fails.

    Returns:
        Process exit code: 1 on bad usage, 0 otherwise.
    """
    if len(sys.argv) != 3:
        print("Usage: steam_changelog_scraper.py <workshop_id> <previous_ts>", file=sys.stderr)
        return 1

    workshop_id = sys.argv[1]
    previous_ts = int(sys.argv[2])

    base_url = f"https://steamcommunity.com/sharedfiles/filedetails/changelog/{workshop_id}"
    headers = {
        # Browser-like UA — Steam serves a stripped page to unknown clients.
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/122.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }

    # Matches the "Update: 21 Mar @ 1:19pm by Author" headers that separate
    # entries in the flattened page text.
    header_re = re.compile(
        r"Update:\s+(?P<date>\d{1,2}\s+[A-Za-z]{3}\s+@\s+\d{1,2}:\d{2}[ap]m)\s+by\s+(?P<author>.*?)(?:\n|$)"
    )

    entries = []
    page_number = 1

    try:
        # Hard cap of 50 pages guards against runaway pagination.
        while page_number <= 50:
            raw_html = fetch_page(base_url, headers, page_number)
            page_text = html_to_text(raw_html)
            matches = list(header_re.finditer(page_text))

            if not matches:
                break  # ran past the last page of entries

            raw_search_start = 0
            for index, match in enumerate(matches):
                # An entry's body runs from its own header up to the next
                # header (or the end of the page text).
                next_start = matches[index + 1].start() if index + 1 < len(matches) else len(page_text)
                body = page_text[match.end():next_start].strip()

                # Cut off the pagination/footer chrome trailing the last
                # entry on a page.
                footer_split = re.split(
                    r"\n(?:Showing\s+\d+-\d+\s+of\s+\d+\s+entries|Additional Links)\b",
                    body,
                    maxsplit=1,
                )
                body = footer_split[0].strip()

                entry_ts, raw_search_start = parse_timestamp(match.group("date"), raw_html, raw_search_start)
                if entry_ts <= previous_ts:
                    # Entries are newest-first, so everything from here on
                    # has already been seen. StopIteration doubles as a
                    # labelled break out of both loops.
                    raise StopIteration

                if not body:
                    body = "No changelog details were provided for this update."

                entry_date = datetime.fromtimestamp(entry_ts, tz=timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
                entries.append(f"### {entry_date}\n\n{body}")

            page_number += 1
    except StopIteration:
        pass
    except (urllib.error.URLError, TimeoutError, ValueError, OSError):
        # Treat any network/parse failure as "nothing found" rather than
        # crashing; partial results are discarded deliberately so a flaky
        # scrape never emits an incomplete changelog.
        entries = []

    if entries:
        print("\n\n".join(entries))
    else:
        print("No changelog entries were found on Steam since the previous saved timestamp.")

    return 0
174+
175+
176+ if __name__ == "__main__" :
177+ raise SystemExit (main ())