Add selenium to tsv example from bob. - brcon2023-hackathons - Bitreichcon 2023… | |
git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fz… | |
Log | |
Files | |
Refs | |
Tags | |
--- | |
commit d2f3f8bf36e6d7b0d88f7d3e02353bcd87a93795 | |
parent a7cd0c547c792f74b7784cc0a8c806380a28ca2f | |
Author: Christoph Lohmann <[email protected]> | |
Date: Thu, 10 Aug 2023 16:14:57 +0200 | |
Add selenium to tsv example from bob. | |
Diffstat: | |
A sfeed-atom/selenium_crawl_tsv.py | 118 +++++++++++++++++++++++++++++… | |
1 file changed, 118 insertions(+), 0 deletions(-) | |
--- | |
diff --git a/sfeed-atom/selenium_crawl_tsv.py b/sfeed-atom/selenium_crawl_tsv.py | |
@@ -0,0 +1,118 @@ | |
+from selenium import webdriver | |
+from selenium.webdriver.common.by import By | |
+ | |
+from selenium.webdriver.firefox.options import Options | |
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |
+ | |
+import sys | |
+from datetime import datetime | |
+ | |
def make_escape_content_trans():
    r"""Build the translation table that escapes a multi-line content field.

    Backslash, newline and tab are replaced by the two-character sequences
    ``\\``, ``\n`` and ``\t``.  Every other control character below 0x20,
    and DEL (0x7f), is removed.

    Returns:
        A mapping of code points to replacement strings, suitable for
        ``str.translate``.
    """
    # Start by deleting every control character below 0x20, plus DEL.
    table = {code: "" for code in range(0x20)}
    table[0x7F] = ""  # DEL

    # Install the escapes under their ordinals.  Keying every entry by code
    # point makes these assignments replace the "delete" entries for newline
    # and tab explicitly, instead of handing str.maketrans() an int key and
    # a str key that both name the same character.
    table[ord("\\")] = "\\\\"
    table[ord("\n")] = "\\n"
    table[ord("\t")] = "\\t"

    return str.maketrans(table)
+ | |
def make_escape_field_trans():
    """Build the translation table that flattens a single-line field.

    Newline and tab are each replaced by one space.  Every other control
    character below 0x20, and DEL (0x7f), is removed.

    Returns:
        A mapping of code points to replacement strings, suitable for
        ``str.translate``.
    """
    # Start by deleting every control character below 0x20, plus DEL.
    table = {code: "" for code in range(0x20)}
    table[0x7F] = ""  # DEL

    # Newline and tab become a space.  Keying by ordinal makes these
    # assignments replace the "delete" entries explicitly, instead of
    # handing str.maketrans() an int key and a str key that both name the
    # same character.
    table[ord("\n")] = " "
    table[ord("\t")] = " "

    return str.maketrans(table)
+ | |
# Both translation tables are built once, at import time, and shared by the
# helpers below.
escape_content_tbl = make_escape_content_trans()
escape_field_tbl = make_escape_field_trans()


def escape_content(s):
    """Return *s* run through the content table, with outer whitespace stripped."""
    translated = s.translate(escape_content_tbl)
    return translated.strip()


def escape_field(s):
    """Return *s* run through the field table, with outer whitespace stripped."""
    translated = s.translate(escape_field_tbl)
    return translated.strip()
+ | |
# The page URL is the first command-line argument; it is required.
if len(sys.argv) < 2:
    # Report the usage on stderr: stdout is where the TSV lines are written,
    # so it must not receive diagnostics.
    print("usage: <url>", file=sys.stderr)
    sys.exit(1)

url = sys.argv[1]
+ | |
# Run Firefox without a visible window.
options = Options()
options.add_argument("--headless")

# To use an existing profile instead of the custom one below.
# NOTE: the profile must not be in use by a running Firefox at the same time.
#options.add_argument("--profile")
#profile_path = "/home/hiltjo/.mozilla/firefox/z86g7oxr.default-release"
#options.add_argument(profile_path)
#options.set_preference("profile", profile_path)

# Custom profile: preferences are applied in the order listed.
_firefox_prefs = (
    # JS disabled
    ("javascript.enabled", False),
    # disable stylesheet
    ("permissions.default.stylesheet", 2),
    # disable image loading
    ("permissions.default.image", 2),
    # override user-agent.
    #("general.useragent.override", "whatever you want"),
)
for _pref_name, _pref_value in _firefox_prefs:
    options.set_preference(_pref_name, _pref_value)

driver = webdriver.Firefox(options=options)
+ | |
# set timeouts
#driver.implicitly_wait(10)

# Load the page given on the command line.
driver.get(url)

# Debugging aid: print driver.title or driver.page_source here to inspect
# what was actually loaded.

# Collect the links inside <main> as plain href/text values before the loop
# further down navigates the driver to other pages.
# (Use By.TAG_NAME, "a" instead to take every link on the page.)
anchors = driver.find_elements(By.CSS_SELECTOR, "main a")
links = []
for anchor in anchors:
    href = anchor.get_attribute("href")
    text = anchor.text
    # get_attribute() may return None when the anchor has no href.  A
    # truthiness test skips both None and "" where len(href) would raise
    # TypeError on None.
    if href:
        links.append({"href": href, "text": text})
+ | |
+# Visit every collected link and print one tab-separated line per page to | |
+# stdout: timestamp, title, link, content and the literal type "html". | |
+# NOTE(review): nothing here catches exceptions, so a failing page skips the | |
+# driver shutdown at the bottom -- consider try/finally. | |
+for link in links: | |
+ driver.get(link["href"]) | |
+ | |
+ # parse timestamp. | |
+ # The text of a <time> element is parsed as YYYY-MM-DD. strptime() returns | |
+ # a naive datetime, so timestamp() interprets it in the local timezone. | |
+ time = driver.find_element(By.TAG_NAME, "time") | |
+ ts = datetime.strptime(time.text, "%Y-%m-%d") | |
+ ts = int(ts.timestamp()) | |
+ | |
+ # NOTE(review): the next line is truncated in this view; the attribute | |
+ # name read from the <article> element is not visible -- confirm. | |
+ content = driver.find_element(By.CSS_SELECTOR, "article").get_attribute("o… | |
+ title = driver.title | |
+ # Drop the " - Codemadness" suffix from the page title. | |
+ title = title.replace(" - Codemadness", "") | |
+ | |
+ # escape fields | |
+ content = escape_content(content) | |
+ title = escape_field(title) | |
+ # Rebinds the loop variable from the link dict to its escaped href string. | |
+ link = escape_field(link["href"]) | |
+ | |
+ print("%d\t%s\t%s\t%s\thtml" % (ts, title, link, content)) | |
+ | |
+# Shut the browser down: close the current window, then end the session. | |
+driver.close() | |
+driver.quit() | |