Introduction
Introduction Statistics Contact Development Disclaimer Help
Add selenium to tsv example from bob. - brcon2023-hackathons - Bitreichcon 2023…
git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fz…
Log
Files
Refs
Tags
---
commit d2f3f8bf36e6d7b0d88f7d3e02353bcd87a93795
parent a7cd0c547c792f74b7784cc0a8c806380a28ca2f
Author: Christoph Lohmann <[email protected]>
Date: Thu, 10 Aug 2023 16:14:57 +0200
Add selenium to tsv example from bob.
Diffstat:
A sfeed-atom/selenium_crawl_tsv.py | 118 +++++++++++++++++++++++++++++…
1 file changed, 118 insertions(+), 0 deletions(-)
---
diff --git a/sfeed-atom/selenium_crawl_tsv.py b/sfeed-atom/selenium_crawl_tsv.py
@@ -0,0 +1,118 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+
+import sys
+from datetime import datetime
+
+def make_escape_content_trans():
+    r"""Build the str.translate() table for the TSV content field.
+
+    Backslash, newline and tab become the two-character sequences
+    ``\\``, ``\n`` and ``\t``.  Every other control character below
+    0x20, and DEL (0x7f), is removed.
+
+    Returns:
+        The mapping of code points to replacement strings produced by
+        str.maketrans().
+    """
+    # delete all C0 control characters and DEL ...
+    table = {code: "" for code in (*range(0x20), 0x7f)}
+    # ... then overwrite the three characters that are escaped instead.
+    # Keys are ordinals throughout, so tab and newline are listed only
+    # once and the result does not depend on how str.maketrans() resolves
+    # a clash between an int key and the equivalent one-character str key.
+    table[ord("\\")] = "\\\\"
+    table[ord("\n")] = "\\n"
+    table[ord("\t")] = "\\t"
+
+    return str.maketrans(table)
+
+def make_escape_field_trans():
+    """Build the str.translate() table for single-line TSV fields.
+
+    Newline and tab are each replaced by one space; every other control
+    character below 0x20, and DEL (0x7f), is removed.
+
+    Returns:
+        The mapping of code points to replacement strings produced by
+        str.maketrans().
+    """
+    # delete all C0 control characters and DEL ...
+    table = {code: "" for code in (*range(0x20), 0x7f)}
+    # ... except newline and tab, which turn into a space.  Ordinal keys
+    # only, so neither character is listed twice in the mapping.
+    table[ord("\n")] = " "
+    table[ord("\t")] = " "
+
+    return str.maketrans(table)
+
+# Translation tables, built once when the script is loaded and reused by
+# escape_content() and escape_field() for every crawled page.
+escape_content_tbl = make_escape_content_trans()
+escape_field_tbl = make_escape_field_trans()
+
+def escape_content(s):
+    """Return s prepared for the TSV content column.
+
+    The string is passed through escape_content_tbl with str.translate()
+    and the result has leading and trailing whitespace stripped.
+    """
+    escaped = s.translate(escape_content_tbl)
+    return escaped.strip()
+
+def escape_field(s):
+    """Return s prepared for a single-line TSV column.
+
+    The string is passed through escape_field_tbl with str.translate()
+    and the result has leading and trailing whitespace stripped.
+    """
+    cleaned = s.translate(escape_field_tbl)
+    return cleaned.strip()
+
+# The only argument is the URL of the index page to crawl.
+if len(sys.argv) < 2:
+    # stdout carries the TSV output, so the usage hint goes to stderr
+    print("usage: <url>", file=sys.stderr)
+    sys.exit(1)
+
+url = sys.argv[1]
+
+# Configure Firefox: run without a visible window.
+options = Options()
+options.add_argument("--headless")
+
+# Alternative (disabled): reuse an existing Firefox profile instead of the
+# custom preferences below.
+
+#options.add_argument("--profile")
+#profile_path = "/home/hiltjo/.mozilla/firefox/z86g7oxr.default-release"
+# NOTE: Firefox must not already be running with this profile.
+#options.add_argument(profile_path)
+#options.set_preference("profile", profile_path)
+
+# Custom profile: only the markup is needed, so switch off everything else.
+# disable JavaScript
+options.set_preference("javascript.enabled", False)
+# disable stylesheets
+options.set_preference("permissions.default.stylesheet", 2)
+# disable image loading
+options.set_preference("permissions.default.image", 2)
+# Optional (disabled): override the User-Agent string.
+#options.set_preference("general.useragent.override", "whatever you want")
+
+# Start the browser session with the options configured above.
+driver = webdriver.Firefox(options=options)
+
+# Optional (disabled): set an implicit wait of 10 for element lookups.
+#driver.implicitly_wait(10)
+
+# Load the index page given on the command line; the links to crawl are
+# read from this page.
+driver.get(url)
+
+# print page title
+#print(driver.title)
+
+#pagesource = driver.execute_script("return document.body.innerHTML;")
+#print(pagesource)
+#print(driver.page_source)
+#outer_html = driver.find_element(By.XPATH, "//body").get_attribute("outerHTML")
+
+#outer_html = driver.find_element(By.TAG_NAME, "html").get_attribute("outerHTML")
+#print(outer_html)
+
+# Collect the links inside <main> of the index page.  href and text are
+# copied into plain dicts here, before the crawl loop navigates away from
+# this page.
+# (to consider every link on the page instead:
+#  driver.find_elements(By.TAG_NAME, "a"))
+anchors = driver.find_elements(By.CSS_SELECTOR, "main a")
+links = []
+for anchor in anchors:
+    href = anchor.get_attribute("href")
+    # get_attribute() returns None for an <a> without an href attribute;
+    # a truthiness test skips both None and "" where len(href) would
+    # raise TypeError on None.
+    if href:
+        links.append({"href": href, "text": anchor.text})
+
+# Visit every collected link and print one TAB-separated line per page:
+# timestamp, title, link, content and the literal "html".
+for link in links:
+ driver.get(link["href"])
+
+ # parse timestamp: the text of the first <time> element, as YYYY-MM-DD.
+ # NOTE(review): strptime() yields a naive datetime, so timestamp()
+ # interprets it in the local timezone of the machine running the crawl.
+ time = driver.find_element(By.TAG_NAME, "time")
+ ts = datetime.strptime(time.text, "%Y-%m-%d")
+ ts = int(ts.timestamp())
+
+ # NOTE(review): the next line is cut off ("…") in this rendering of the
+ # diff; the attribute name is presumably "outerHTML" - confirm against
+ # the repository.
+ content = driver.find_element(By.CSS_SELECTOR, "article").get_attribute("o…
+ # drop the site suffix from the page title
+ title = driver.title
+ title = title.replace(" - Codemadness", "")
+
+ # escape fields so that no raw TAB or newline ends up in the output line
+ content = escape_content(content)
+ title = escape_field(title)
+ # rebinds the loop variable from the dict to the escaped URL string
+ link = escape_field(link["href"])
+
+ print("%d\t%s\t%s\t%s\thtml" % (ts, title, link, content))
+
+# Shut down: close the current window, then end the browser session.
+# NOTE(review): there is no try/finally in this script, so these calls are
+# not reached when an exception is raised while crawling.
+driver.close()
+driver.quit()
You are viewing proxied material from bitreich.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.