-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy paththird_pass.py
More file actions
46 lines (39 loc) · 1.71 KB
/
third_pass.py
File metadata and controls
46 lines (39 loc) · 1.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import glob
import csv
import os
def get_all_urls():
models = glob.glob('./data/links/*.tars.csv')
urls = []
for model in models:
with open(model, newline='') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
headers = next(reader, None)
current_list = [rows[0].strip() for rows in reader]
urls.extend(current_list)
print(f'{len(urls)} links to tar files, deduplicated it\'s {len(set(urls))}')
return urls
def write_all_urls(urls):
    """Write the deduplicated URLs to ./data/urls.txt, one per line.

    Args:
        urls: iterable of URL strings; duplicates are collapsed via set(),
              so output order is arbitrary.
    """
    unique_urls = set(urls)
    with open('./data/urls.txt', 'w', newline="\n") as download_list:
        for url in unique_urls:
            download_list.write(url + '\n')
def write_new_urls(urls):
    """Write only the URLs not already present in ./data/urls.txt.

    Reads the existing list from ./data/urls.txt, diffs the incoming
    *urls* against it, prints summary counts, and writes the genuinely
    new URLs (one per line, arbitrary order) to ./data/urls_new.txt.

    Args:
        urls: iterable of URL strings gathered in the current run.
    """
    with open('./data/urls.txt', 'r') as existing_urls_fp:
        with open('./data/urls_new.txt', 'w+') as new_urls_fp:
            existing_urls = [x.strip() for x in existing_urls_fp.readlines()]
            new_urls_set = set(urls)  # Yes yes, we're already deduplicated
            existing_urls_set = set(existing_urls)
            # BUG FIX: the original used symmetric difference (^), which also
            # re-emitted URLs present ONLY in the old urls.txt — stale entries
            # no longer in any source CSV would land in urls_new.txt as "new".
            # We want the URLs that are new and not already known: set minus.
            actually_new_urls = new_urls_set - existing_urls_set
            print("Existing: ", len(existing_urls_set))
            print("New: ", len(new_urls_set))
            print("Unique: ", len(actually_new_urls))
            new_urls_fp.writelines(u + '\n' for u in actually_new_urls)
if __name__ == '__main__':
    # Gather every URL from the source CSVs, then either seed urls.txt
    # from scratch or emit only the not-yet-seen links alongside it.
    urls = get_all_urls()
    if not os.path.exists("./data/urls.txt"):
        print("no urls.txt, dumping everything")
        write_all_urls(urls)
    else:
        print("urls.txt exists, only dumping new links")
        write_new_urls(urls)