Introduction
Introduction Statistics Contact Development Disclaimer Help
selenium_crawl_tsv.py - brcon2023-hackathons - Bitreichcon 2023 Hackathon Repos…
git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fz…
Log
Files
Refs
Tags
---
selenium_crawl_tsv.py (3006B)
---
1 from selenium import webdriver
2 from selenium.webdriver.common.by import By
3
4 from selenium.webdriver.firefox.options import Options
5 from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
6
7 import sys
8 from datetime import datetime
9
def make_escape_content_trans():
    r"""Build the ``str.translate`` table that escapes a content field.

    Backslash, newline and tab are rewritten as the two-character
    sequences ``\\``, ``\n`` and ``\t``; every other control character in
    the range 0x00-0x1f, and DEL (0x7f), is deleted.

    Returns:
        dict: mapping of code point to replacement string, suitable for
        ``str.translate``.
    """
    replacements = {"\\": "\\\\", "\n": "\\n", "\t": "\\t"}
    # Control characters without a replacement are dropped.  The two groups
    # are kept disjoint so the table never lists the same code point twice
    # (once as an int key, once as a str key), which would make the result
    # depend on how str.maketrans resolves the clash.
    table = {
        chr(code): ""
        for code in (*range(32), 0x7F)
        if chr(code) not in replacements
    }
    table.update(replacements)
    return str.maketrans(table)
21
def make_escape_field_trans():
    """Build the ``str.translate`` table that sanitizes a single-line field.

    Newline and tab are each replaced by one space; every other control
    character in the range 0x00-0x1f, and DEL (0x7f), is deleted.

    Returns:
        dict: mapping of code point to replacement string, suitable for
        ``str.translate``.
    """
    blanks = {"\n": " ", "\t": " "}
    # Control characters without a replacement are dropped.  The two groups
    # are kept disjoint so the table never lists the same code point twice
    # (once as an int key, once as a str key), which would make the result
    # depend on how str.maketrans resolves the clash.
    table = {
        chr(code): ""
        for code in (*range(32), 0x7F)
        if chr(code) not in blanks
    }
    table.update(blanks)
    return str.maketrans(table)
32
# Translation tables are built once, at import time, and then reused for
# every string that gets escaped.
escape_field_tbl = make_escape_field_trans()
escape_content_tbl = make_escape_content_trans()
35
def escape_content(s):
    """Return *s* escaped for use as the content column.

    The string is passed through ``escape_content_tbl`` and the result is
    stripped of leading and trailing whitespace.
    """
    translated = s.translate(escape_content_tbl)
    return translated.strip()
38
def escape_field(s):
    """Return *s* sanitized for use as a single-line column.

    The string is passed through ``escape_field_tbl`` and the result is
    stripped of leading and trailing whitespace.
    """
    translated = s.translate(escape_field_tbl)
    return translated.strip()
41
# The URL to crawl is the first command-line argument and is required.
if len(sys.argv) < 2:
    # Usage goes to stderr: stdout is reserved for the TSV records this
    # script prints, so it must not be polluted with diagnostics.
    print("usage: <url>", file=sys.stderr)
    sys.exit(1)
url = sys.argv[1]
47
# Run Firefox without a visible window.
options = Options()
options.add_argument("--headless")

# use existing profile:
# NOTE: Firefox must not be running with that profile at the same time.

#options.add_argument("--profile")
#profile_path = "/home/hiltjo/.mozilla/firefox/z86g7oxr.default-release"
#options.add_argument(profile_path)
#options.set_preference("profile", profile_path)

# setup custom profile: preferences are applied in the order listed.
custom_prefs = {
    # JS disabled
    "javascript.enabled": False,
    # disable stylesheet
    "permissions.default.stylesheet": 2,
    # disable image loading
    "permissions.default.image": 2,
}
for pref_name, pref_value in custom_prefs.items():
    options.set_preference(pref_name, pref_value)
66 # override user-agent.
67 #options.set_preference("general.useragent.override", "whatever you want…
68
# Launch Firefox using the options configured earlier in this script.
driver = webdriver.Firefox(options=options)

# An implicit wait can be enabled here if elements need time to appear:
#driver.implicitly_wait(10)

# Load the page given on the command line.
driver.get(url)
76
77 # print page title
78 #print(driver.title)
79
80 #pagesource = driver.execute_script("return document.body.InnerHTML;")
81 #print(pagesource)
82 #print(driver.page_source)
83 #outer_html = driver.find_element(By.XPATH, "//body").get_attribute("out…
84
85 #outer_html = driver.find_element(By.TAG_NAME, "html").get_attribute("ou…
86 #print(outer_html)
87
88 # show all links on a page
89 #links = driver.find_elements(By.TAG_NAME, "a")
# Gather every link found inside <main>.  The href and text are copied
# into plain dicts before the driver navigates to each link.
anchors = driver.find_elements(By.CSS_SELECTOR, "main a")
links = []
for anchor in anchors:
    href = anchor.get_attribute("href")
    # get_attribute() returns None when the anchor has no href attribute
    # (len(None) would raise TypeError); skip those and empty values alike.
    if href:
        links.append({"href": href, "text": anchor.text})
97
98 for link in links:
99 driver.get(link["href"])
100
101 # parse timestamp.
102 time = driver.find_element(By.TAG_NAME, "time")
103 ts = datetime.strptime(time.text, "%Y-%m-%d")
104 ts = int(ts.timestamp())
105
106 content = driver.find_element(By.CSS_SELECTOR, "article").get_attrib…
107 title = driver.title
108 title = title.replace(" - Codemadness", "")
109
110 # escape fields
111 content = escape_content(content)
112 title = escape_field(title)
113 link = escape_field(link["href"])
114
115 print("%d\t%s\t%s\t%s\thtml" % (ts, title, link, content))
116
# quit() closes every window and ends the WebDriver session, so a separate
# close() of the current window beforehand is not needed.
driver.quit()
You are viewing proxied material from bitreich.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.