-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjinyongselenium.py
More file actions
104 lines (82 loc) · 3.38 KB
/
jinyongselenium.py
File metadata and controls
104 lines (82 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import json, os, pprint, time
from urllib import parse
# Chrome launch options for the single browser session shared by every
# function in this module.
options = webdriver.ChromeOptions()
# options.add_argument("--headless") # run in the background without opening a visible browser window
options.add_argument('--start-maximized')
options.add_argument('--incognito')
options.add_argument('--disable-popup-blocking')
# NOTE(review): unlike the flags above, this one has no leading "--";
# confirm that is intended.
options.add_argument('enable-web-bluetooth-scanning')
# The browser is started here, at module level, so it launches as soon as
# this file is executed or imported.
# NOTE(review): the chromedriver path is hard-coded and machine-specific, and
# the `executable_path` keyword is deprecated in Selenium 4 in favour of a
# Service object -- confirm which Selenium version this script targets.
driver = webdriver.Chrome(executable_path=r'C:\Users\yantaishih\PycharmProjects\jinyong\chromedriver.exe',options = options)
# Accumulated scrape result: getMainLinks() appends one {"title", "link"} dict
# per book, and getSubLinks() adds a "sub" list of chapter dicts to each.
listData = []
# Landing page opened by visit().
url = 'https://www.bookwormzz.com/zh/'
def visit():
    """Load the landing page (the module-level ``url``) in the shared browser."""
    landing_page = url
    driver.get(landing_page)
def getMainLinks():
    """Append one entry per book link on the current page to ``listData``.

    Every anchor matching ``a[data-ajax="false"]`` contributes a dict whose
    ``"title"`` is the anchor's ``innerText`` and whose ``"link"`` is its
    percent-decoded ``href``.
    """
    for anchor in driver.find_elements(By.CSS_SELECTOR, 'a[data-ajax="false"]'):
        title = anchor.get_attribute('innerText')
        href = anchor.get_attribute('href')
        listData.append({"title": title, "link": parse.unquote(href)})
def getSubLinks():
    """Fill in the ``"sub"`` chapter list of every book in ``listData``.

    For each book, its ``"link"`` plus ``#book_toc`` is opened and the browser
    waits up to 3 seconds for a chapter anchor to be present. Each matching
    anchor is appended to the book's ``"sub"`` list as a dict with
    ``"sub_title"`` (``innerText``) and ``"sub_link"`` (percent-decoded
    ``href``). A book whose wait times out is skipped and keeps whatever was
    already in its ``"sub"`` list.
    """
    chapter_selector = 'a[rel="external"][class="ui-link"]'
    for book in listData:
        # Create the chapter list only if the book does not have one yet.
        chapters = book.setdefault('sub', [])
        driver.get(book["link"] + "#book_toc")
        try:
            wait = WebDriverWait(driver, 3)
            wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, chapter_selector))
            )
            for anchor in driver.find_elements(By.CSS_SELECTOR, chapter_selector):
                sub_title = anchor.get_attribute("innerText")
                sub_href = anchor.get_attribute("href")
                chapters.append({
                    "sub_title": sub_title,
                    "sub_link": parse.unquote(sub_href),
                })
        except TimeoutException:
            # No table of contents showed up in time; move on to the next book.
            continue
def close():
    """Quit the shared module-level WebDriver by calling ``driver.quit()``."""
    browser = driver
    browser.quit()
def saveJson():
    """Write ``listData`` to ``jinyongselenium.json`` as UTF-8 JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) rather
    than as ``\\uXXXX`` escapes. The file is created in the current working
    directory and overwritten if it already exists.
    """
    # Serialize before opening the file so that a serialization error does
    # not truncate an existing output file.
    payload = json.dumps(listData, ensure_ascii=False)
    # The context manager closes the handle even if the write fails.
    with open("jinyongselenium.json", "w", encoding="utf-8") as fp:
        fp.write(payload)
def writeTxt():
    """Download every chapter listed in ``jinyongselenium.json`` and save its text.

    The JSON file is read from the current working directory. For each chapter
    of each book, the chapter's ``"sub_link"`` is opened in the shared browser,
    the ``innerText`` of the ``div#html > div`` element is taken, and spaces,
    carriage returns and newlines are removed from it. The result is written
    to ``jinyongselenium_txt/<title> <sub_title>.txt`` (UTF-8), and all chapter
    texts are finally dumped as a JSON list to ``trainselenium.json``.

    Raises:
        FileNotFoundError: if ``jinyongselenium.json`` does not exist.
        Exceptions raised by the WebDriver calls (for example when the content
        element is missing) are not caught and abort the run.
    """
    # Context manager so the input handle is closed; the name was previously
    # rebound to the output files without ever closing this one.
    with open("jinyongselenium.json", "r", encoding="utf-8") as fp:
        listResult = json.load(fp)

    folderPath = 'jinyongselenium_txt'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folderPath, exist_ok=True)

    listContent = []
    for book in listResult:
        for chapter in book['sub']:
            driver.get(chapter['sub_link'])
            div = driver.find_element(By.CSS_SELECTOR, 'div#html > div')
            strContent = div.get_attribute('innerText')

            # NOTE(review): the original called replace(" ", "") twice; the
            # second call was a no-op as written. It may originally have
            # targeted a different whitespace character (e.g. a full-width
            # space) -- confirm against the upstream file.
            for unwanted in (" ", "\r", "\n"):
                strContent = strContent.replace(unwanted, "")

            fileName = f"{book['title']} {chapter['sub_title']}.txt"
            with open(f"{folderPath}/{fileName}", "w", encoding="utf-8") as out:
                out.write(strContent)

            listContent.append(strContent)

    with open("trainselenium.json", "w", encoding="utf-8") as out:
        out.write(json.dumps(listContent, ensure_ascii=False))
if __name__ == '__main__':
    # Full pipeline: open the site, collect book links, collect chapter links,
    # persist them as JSON, then download every chapter's text.
    try:
        visit()
        getMainLinks()
        getSubLinks()
        saveJson()
        writeTxt()
    finally:
        # Always shut the browser down, even when a step raises; otherwise
        # the Chrome and chromedriver processes are left running.
        close()