selenium_crawl_tsv.py - brcon2023-hackathons - Bitreichcon 2023 Hackathon Repos… | |
git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fz… | |
Log | |
Files | |
Refs | |
Tags | |
--- | |
selenium_crawl_tsv.py (3006B) | |
--- | |
1 from selenium import webdriver | |
2 from selenium.webdriver.common.by import By | |
3 | |
4 from selenium.webdriver.firefox.options import Options | |
5 from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |
6 | |
7 import sys | |
8 from datetime import datetime | |
9 | |
def make_escape_content_trans():
    """Build the translation table used to escape the TSV content field.

    A backslash becomes two backslashes, a newline becomes backslash + "n"
    and a tab becomes backslash + "t".  Every other ASCII control character
    (0x00-0x1f) and DEL (0x7f) is removed.

    Returns:
        dict: code point -> replacement string, suitable for str.translate().
    """
    # Start by deleting all C0 control characters and DEL.
    table = {codepoint: "" for codepoint in range(32)}
    table[0x7F] = ""  # DEL

    # Override the characters that have an escaped form.  Only integer
    # ordinals are used as keys so tab and newline have exactly one entry
    # each, instead of an int key and a str key competing for the same
    # code point.
    table[ord("\\")] = "\\\\"
    table[ord("\n")] = "\\n"
    table[ord("\t")] = "\\t"

    return str.maketrans(table)
21 | |
def make_escape_field_trans():
    """Build the translation table used to sanitize single-line TSV fields.

    Newlines and tabs are replaced by a single space so the value cannot
    break the tab-separated line.  Every other ASCII control character
    (0x00-0x1f) and DEL (0x7f) is removed.

    Returns:
        dict: code point -> replacement string, suitable for str.translate().
    """
    # Start by deleting all C0 control characters and DEL.
    table = {codepoint: "" for codepoint in range(32)}
    table[0x7F] = ""  # DEL

    # Override the field/record separators with a space.  Only integer
    # ordinals are used as keys so tab and newline have exactly one entry
    # each, instead of an int key and a str key competing for the same
    # code point.
    table[ord("\n")] = " "
    table[ord("\t")] = " "

    return str.maketrans(table)
32 | |
# Translation tables are built once at import time and shared by every
# escape_content() / escape_field() call.
escape_field_tbl = make_escape_field_trans()
escape_content_tbl = make_escape_content_trans()
35 | |
def escape_content(s):
    """Return *s* escaped for the content column of a TSV line.

    The string is translated through ``escape_content_tbl`` and then has
    leading and trailing whitespace stripped.
    """
    escaped = s.translate(escape_content_tbl)
    return escaped.strip()
38 | |
def escape_field(s):
    """Return *s* sanitized for a single-line TSV column.

    The string is translated through ``escape_field_tbl`` and then has
    leading and trailing whitespace stripped.
    """
    sanitized = s.translate(escape_field_tbl)
    return sanitized.strip()
41 | |
# The page to crawl is the first command-line argument.
if len(sys.argv) < 2:
    # Usage goes to stderr: stdout carries the TSV records this script
    # prints, so it must not be polluted with diagnostics.
    print("usage: <url>", file=sys.stderr)
    sys.exit(1)
url = sys.argv[1]
47 | |
# Run Firefox without a visible window.
options = Options()
options.add_argument("--headless")

# To reuse an existing profile instead of the custom one below:
# NOTE: Firefox must not be running with that profile at the same time.
#options.add_argument("--profile")
#profile_path = "/home/hiltjo/.mozilla/firefox/z86g7oxr.default-release"
#options.add_argument(profile_path)
#options.set_preference("profile", profile_path)

# Custom profile, applied in this order.
custom_prefs = {
    # JS disabled
    "javascript.enabled": False,
    # disable stylesheet
    "permissions.default.stylesheet": 2,
    # disable image loading
    "permissions.default.image": 2,
}
for pref_name, pref_value in custom_prefs.items():
    options.set_preference(pref_name, pref_value)
# The user-agent can be overridden with the "general.useragent.override"
# preference.

driver = webdriver.Firefox(options=options)

# Timeouts could be set here, e.g.:
#driver.implicitly_wait(10)

# Load the start page.
driver.get(url)
76 | |
77 # print page title | |
78 #print(driver.title) | |
79 | |
80 #pagesource = driver.execute_script("return document.body.InnerHTML;") | |
81 #print(pagesource) | |
82 #print(driver.page_source) | |
83 #outer_html = driver.find_element(By.XPATH, "//body").get_attribute("out… | |
84 | |
85 #outer_html = driver.find_element(By.TAG_NAME, "html").get_attribute("ou… | |
86 #print(outer_html) | |
87 | |
# Collect the target and label of every link inside <main>.  The values are
# copied into plain dicts now because the browser is navigated to other
# pages afterwards.
# To take every link on the page instead:
#anchors = driver.find_elements(By.TAG_NAME, "a")
anchors = driver.find_elements(By.CSS_SELECTOR, "main a")
links = []
for anchor in anchors:
    href = anchor.get_attribute("href")
    # get_attribute() returns None when the <a> has no href (e.g. a named
    # anchor), so test truthiness instead of calling len() on it.
    if not href:
        continue
    links.append({"href": href, "text": anchor.text})
97 | |
98 for link in links: | |
99 driver.get(link["href"]) | |
100 | |
101 # parse timestamp. | |
102 time = driver.find_element(By.TAG_NAME, "time") | |
103 ts = datetime.strptime(time.text, "%Y-%m-%d") | |
104 ts = int(ts.timestamp()) | |
105 | |
106 content = driver.find_element(By.CSS_SELECTOR, "article").get_attrib… | |
107 title = driver.title | |
108 title = title.replace(" - Codemadness", "") | |
109 | |
110 # escape fields | |
111 content = escape_content(content) | |
112 title = escape_field(title) | |
113 link = escape_field(link["href"]) | |
114 | |
115 print("%d\t%s\t%s\t%s\thtml" % (ts, title, link, content)) | |
116 | |
# quit() closes every window and ends the WebDriver session, so a separate
# close() beforehand is unnecessary.
driver.quit()