Add example selenium script for the atom hackathon. - brcon2023-hackathons - Bi… | |
git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fz… | |
Log | |
Files | |
Refs | |
Tags | |
--- | |
commit a7cd0c547c792f74b7784cc0a8c806380a28ca2f | |
parent 2922c09dc4919dcea4ac331bbaa4e373ba4ccc4a | |
Author: Christoph Lohmann <[email protected]> | |
Date: Thu, 10 Aug 2023 16:10:01 +0200 | |
Add example selenium script for the atom hackathon. | |
Diffstat: | |
A sfeed-atom/kvssachsen2atom | 121 +++++++++++++++++++++++++++++… | |
1 file changed, 121 insertions(+), 0 deletions(-) | |
--- | |
diff --git a/sfeed-atom/kvssachsen2atom b/sfeed-atom/kvssachsen2atom | |
@@ -0,0 +1,121 @@ | |
+#!/usr/bin/env python | |
+# coding=utf-8 | |
+# | |
+# Copy me if you can. | |
+# by 20h | |
+# | |
+ | |
+import os | |
+import sys | |
+import getopt | |
+ | |
+from selenium import webdriver | |
+from selenium.webdriver.chrome.options import Options as chromeoptions | |
+from selenium.webdriver.support.ui import WebDriverWait | |
+from selenium.webdriver.support import expected_conditions as EC | |
+from selenium.webdriver.common.by import By | |
+ | |
+from datetime import datetime | |
+import pytz | |
+ | |
def usage(app):
    """Write the one-line usage message to stderr and exit with status 1.

    app is the path the script was invoked as; only its final path
    component appears in the message.
    """
    progname = os.path.basename(app)
    message = "usage: %s [-h] URI\n" % (progname,)
    sys.stderr.write(message)
    sys.exit(1)
+ | |
+def main(args): | |
+ # Convert the news list of the page at URI into an Atom feed on stdout. | |
+ # args is the full argv list: args[0] is the program name, args[1:] are | |
+ # parsed with getopt (only -h is accepted) and the first positional | |
+ # argument is the URI to load. Returns 0 after the closing </feed> tag; | |
+ # usage() ends the process via sys.exit(1) on bad or missing arguments. | |
+ try: | |
+ opts, largs = getopt.getopt(args[1:], "h") | |
+ except getopt.GetoptError as err: | |
+ # NOTE(review): print() sends the getopt error to stdout, the stream | |
+ # the feed is written to; usage() itself writes to stderr. | |
+ print(str(err)) | |
+ usage(args[0]) | |
+ | |
+ for o, a in opts: | |
+ if o == "-h": | |
+ usage(args[0]) | |
+ else: | |
+ # Only "h" is passed to getopt above, so no other option is | |
+ # expected to reach this branch. | |
+ assert False, "unhandled option" | |
+ | |
+ if len(largs) < 1: | |
+ usage(args[0]) | |
+ | |
+ link = largs[0] | |
+ | |
+ # Chrome switches, handed one by one to Options.add_argument(). | |
+ options = chromeoptions() | |
+ chromearguments = [ | |
+ "headless", | |
+ "no-sandbox", | |
+ "disable-extensions", | |
+ "disable-dev-shm-usage", | |
+ "start-maximized", | |
+ "window-size=1900,1080", | |
+ "disable-gpu" | |
+ ] | |
+ for carg in chromearguments: | |
+ options.add_argument(carg) | |
+ | |
+ # NOTE(review): no driver.quit() call is visible in this function. | |
+ driver = webdriver.Chrome(options=options) | |
+ driver.get(link) | |
+ | |
+ # Block for up to 60 seconds until a div carrying a data-last-letter | |
+ # attribute is present; the returned element is not used again below. | |
+ isnews = WebDriverWait(driver=driver, timeout=60).until( | |
+ EC.presence_of_element_located((By.XPATH, | |
+ "//div[@data-last-letter]") | |
+ ) | |
+ ) | |
+ # NOTE(review): the next line and the og:title line are cut off in this | |
+ # view, so the end of each expression is unknown. Later code calls | |
+ # newslist.find_elements() and formats title with %s. | |
+ newslist = driver.find_elements(By.XPATH, "//div[@data-filter-target=\… | |
+ | |
+ title = driver.find_elements(By.XPATH, "//meta[@property=\"og:title\"]… | |
+ # The subtitle reuses the title; globaltags is assigned but not read | |
+ # again in this function. | |
+ description = title | |
+ globaltags = "" | |
+ | |
+ # Feed header. | |
+ print("""<?xml version="1.0" encoding="utf-8"?>""") | |
+ print("""<feed xmlns="http://www.w3.org/2005/Atom">""") | |
+ print("\t<title><![CDATA[%s]]></title>" % (title)) | |
+ print("\t<subtitle><![CDATA[%s]]></subtitle>" % (description)) | |
+ print("\t<id>%s</id>" % (link)) | |
+ print("\t<link href=\"%s\" rel=\"self\" />" % (link)) | |
+ print("\t<link href=\"%s\" />" % (link)) | |
+ | |
+ # Feed-level timestamp: the current time in UTC. | |
+ utcnow = datetime.now(pytz.utc) | |
+ print("\t<updated>%s</updated>" % (utcnow.isoformat())) | |
+ | |
+ # The direct div children of newslist are treated as the articles. | |
+ articles = newslist.find_elements(By.XPATH, "./div") | |
+ # Scheme and host of the URI: split at the first three slashes and drop | |
+ # the last piece, e.g. "https://host/path" -> "https://host". | |
+ # NOTE(review): a URI with no slash after the host loses the host here. | |
+ baselink = "/".join(link.split("/", 3)[:-1]) | |
+ # Walk the articles in reverse document order. | |
+ for article in articles[::-1]: | |
+ # From here on `link` is the first <a> child of the article, no | |
+ # longer the URI string. | |
+ link = article.find_elements(By.XPATH, "./a")[0] | |
+ plink = link.get_attribute("href") | |
+ # hrefs that do not start with "http" are prefixed with baselink. | |
+ if not plink.startswith("http"): | |
+ plink = "%s/%s" % (baselink, plink) | |
+ ptitle = link.get_attribute("data-title") | |
+ pcontent = article.text | |
+ pauthor = "[email protected]" | |
+ | |
+ # Normalize datetime. | |
+ # The text of the first <time> element is parsed as DD.MM.YYYY; | |
+ # articles whose date does not parse are skipped. | |
+ updateds = article.find_elements(By.XPATH, ".//time")[0].text | |
+ try: | |
+ dtupdated = datetime.strptime(updateds, "%d.%m.%Y") | |
+ except ValueError: | |
+ continue | |
+ | |
+ # Only a date is parsed, so the time is pinned to 12:00:00 UTC. | |
+ dtupdated = dtupdated.replace(hour=12, minute=0,\ | |
+ second=0, tzinfo=pytz.utc) | |
+ # A year later than the current one is replaced by the current year. | |
+ if dtupdated.year > utcnow.year: | |
+ dtupdated = dtupdated.replace(year=utcnow.year) | |
+ pupdated = dtupdated | |
+ | |
+ # NOTE(review): the values below are placed into CDATA sections and | |
+ # attributes without any escaping. | |
+ print("\t<entry>") | |
+ print("\t\t<id>%s</id>" % (plink)) | |
+ print("\t\t<title><![CDATA[%s]]></title>" % (ptitle)) | |
+ print("\t\t<link href=\"%s\" />" % (plink)) | |
+ print("\t\t<author><name>%s</name></author>" % (pauthor)) | |
+ print("\t\t<updated>%s</updated>" % (pupdated.isoformat())) | |
+ print("\t\t<content><![CDATA[%s]]></content>" % (pcontent)) | |
+ print("\t</entry>") | |
+ | |
+ print("</feed>") | |
+ | |
+ return 0 | |
+ | |
if __name__ == "__main__":
    # Run the converter on the process arguments and hand its return value
    # to the shell as the exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)
+ |