GopherProxy

	kvssachsen2atom - brcon2023-hackathons - Bitreichcon 2023 Hackathon Repository
	git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fz…
	Log
	Files
	Refs
	Tags
	---
	kvssachsen2atom (3188B)
	---
	1 #!/usr/bin/env python
	2 # coding=utf-8
	3 #
	4 # Copy me if you can.
	5 # by 20h
	6 #
	7
	8 import os
	9 import sys
	10 import getopt
	11
	12 from selenium import webdriver
	13 from selenium.webdriver.chrome.options import Options as chromeoptions
	14 from selenium.webdriver.support.ui import WebDriverWait
	15 from selenium.webdriver.support import expected_conditions as EC
	16 from selenium.webdriver.common.by import By
	17
	18 from datetime import datetime
	19 import pytz
	20
	21 def usage(app):
	22 app = os.path.basename(app)
	23 sys.stderr.write("usage: %s [-h] URI\n" % (app))
	24 sys.exit(1)
	25
	# main(args): load the page at the URI given on the command line in a
	# headless Chrome session (via Selenium) and print an Atom feed built
	# from the page's news entries to stdout.
	#
	# args is the full argument vector (args[0] is the program name).
	# Returns 0 after the closing </feed> tag has been printed; argument
	# errors leave through usage(), which exits the process with status 1.
	#
	# NOTE(review): several lines of this listing are cut off with "…"
	# (XPath strings, the strptime format, two print calls); the comments
	# below describe only what is visible.
	# NOTE(review): no driver.quit()/close() is visible in this function, so
	# the Chrome session appears to be left for process exit to clean up --
	# confirm against the untruncated source.
	26 def main(args):
	27 try:
	28 opts, largs = getopt.getopt(args[1:], "h")
	29 except getopt.GetoptError as err:
	# The parse error is printed to stdout (not stderr) before usage() exits.
	30 print(str(err))
	31 usage(args[0])
	32
	33 for o, a in opts:
	34 if o == "-h":
	# NOTE(review): -h goes through usage(), so help exits with status 1.
	35 usage(args[0])
	36 else:
	# NOTE(review): assert is stripped under "python -O"; with the option
	# string "h" this branch should be unreachable anyway.
	37 assert False, "unhandled option"
	38
	# At least one positional argument (the URI) is required.
	39 if len(largs) < 1:
	40 usage(args[0])
	41
	42 link = largs[0]
	43
	# Command-line switches handed to Chrome; each is added to the options
	# object one by one below.
	44 options = chromeoptions()
	45 chromearguments = [
	46 "headless",
	47 "no-sandbox",
	48 "disable-extensions",
	49 "disable-dev-shm-usage",
	50 "start-maximized",
	51 "window-size=1900,1080",
	52 "disable-gpu"
	53 ]
	54 for carg in chromearguments:
	55 options.add_argument(carg)
	56
	57 driver = webdriver.Chrome(options=options)
	58 driver.get(link)
	59
	# Wait up to 60 seconds for a <div> carrying a data-last-letter
	# attribute to be present.  The returned element (isnews) is not used
	# again in the visible code; the call serves as a "page is ready" gate.
	60 isnews = WebDriverWait(driver=driver, timeout=60).until(
	61 EC.presence_of_element_located((By.XPATH,
	62 "//div[@data-last-letter]")
	63 )
	64 )
	# NOTE(review): line truncated.  find_elements() returns a list, but
	# newslist is later used as a single element (newslist.find_elements),
	# so the cut-off tail presumably indexes into the result -- confirm.
	65 newslist = driver.find_elements(By.XPATH, "//div[@data-filter-ta…
	66
	# NOTE(review): line truncated; it appears to read the page's og:title
	# <meta> element.  The same value is reused as the feed subtitle.
	67 title = driver.find_elements(By.XPATH, "//meta[@property=\"og:ti…
	68 description = title
	# globaltags is not referenced again in the visible code.
	69 globaltags = ""
	70
	# Feed header.  Title and subtitle are wrapped in CDATA; the id and both
	# link elements all carry the URI given on the command line.
	# NOTE(review): a value containing "]]>" would terminate its CDATA
	# section early; nothing visible guards against that.
	71 print("""<?xml version="1.0" encoding="utf-8"?>""")
	72 print("""<feed xmlns="http://www.w3.org/2005/Atom">""")
	73 print("\t<title><![CDATA[%s]]></title>" % (title))
	74 print("\t<subtitle><![CDATA[%s]]></subtitle>" % (description))
	75 print("\t<id>%s</id>" % (link))
	76 print("\t<link href=\"%s\" rel=\"self\" />" % (link))
	77 print("\t<link href=\"%s\" />" % (link))
	78
	# The feed-level <updated> stamp is the current time in UTC.
	79 utcnow = datetime.now(pytz.utc)
	80 print("\t<updated>%s</updated>" % (utcnow.isoformat()))
	81
	# Each direct child <div> of newslist is treated as one article.
	82 articles = newslist.find_elements(By.XPATH, "./div")
	# Keep the first three "/"-separated pieces of the URI and drop the
	# rest, e.g. "https://host/a/b" -> "https://host".  Used to prefix
	# hrefs that do not start with "http".
	83 baselink = "/".join(link.split("/", 3)[:-1])
	# Articles are emitted in reverse of their order on the page.
	84 for article in articles[::-1]:
	# From here on "link" is rebound to the article's first <a> element and
	# no longer holds the command-line URI (baselink was computed above).
	85 link = article.find_elements(By.XPATH, "./a")[0]
	86 plink = link.get_attribute("href")
	87 if not plink.startswith("http"):
	88 plink = "%s/%s" % (baselink, plink)
	89 ptitle = link.get_attribute("data-title")
	90 pcontent = article.text
	# NOTE(review): "[email protected]" looks like an e-mail obfuscation
	# placeholder introduced by the page this listing was captured from;
	# verify the real author value against the repository.
	91 pauthor = "[email protected]"
	92
	93 # Normalize datetime.
	# NOTE(review): both lines below are truncated.  The first takes the
	# first <time> descendant of the article; the second parses a date that
	# starts with day.month. -- the rest of the format is not visible.
	94 updateds = article.find_elements(By.XPATH, ".//time")[0]…
	95 try:
	96 dtupdated = datetime.strptime(updateds, "%d.%m.%…
	97 except ValueError:
	# Articles whose date cannot be parsed are skipped silently.
	98 continue
	99
	# Pin the time of day to 12:00:00 and mark the value as UTC.
	100 dtupdated = dtupdated.replace(hour=12, minute=0,\
	101 second=0, tzinfo=pytz.utc)
	# A year later than the current one is replaced by the current year.
	102 if dtupdated.year > utcnow.year:
	103 dtupdated = dtupdated.replace(year=utcnow.year)
	104 pupdated = dtupdated
	105
	# One <entry> per article; the article link doubles as the entry id.
	106 print("\t<entry>")
	107 print("\t\t<id>%s</id>" % (plink))
	108 print("\t\t<title><![CDATA[%s]]></title>" % (ptitle))
	109 print("\t\t<link href=\"%s\" />" % (plink))
	110 print("\t\t<author><name>%s</name></author>" % (pauthor))
	111 print("\t\t<updated>%s</updated>" % (pupdated.isoformat(…
	112 print("\t\t<content><![CDATA[%s]]></content>" % (pconten…
	113 print("\t</entry>")
	114
	115 print("</feed>")
	116
	117 return 0
	118
	119 if __name__ == "__main__":
	120 sys.exit(main(sys.argv))
	121