kvssachsen2atom - brcon2023-hackathons - Bitreichcon 2023 Hackathon Repository | |
git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fz… | |
Log | |
Files | |
Refs | |
Tags | |
--- | |
kvssachsen2atom (3188B) | |
--- | |
1 #!/usr/bin/env python | |
2 # coding=utf-8 | |
3 # | |
4 # Copy me if you can. | |
5 # by 20h | |
6 # | |
7 | |
8 import os | |
9 import sys | |
10 import getopt | |
11 | |
12 from selenium import webdriver | |
13 from selenium.webdriver.chrome.options import Options as chromeoptions | |
14 from selenium.webdriver.support.ui import WebDriverWait | |
15 from selenium.webdriver.support import expected_conditions as EC | |
16 from selenium.webdriver.common.by import By | |
17 | |
18 from datetime import datetime | |
19 import pytz | |
20 | |
def usage(app):
    """Print a one-line usage message to stderr and exit with status 1.

    Args:
        app: Path or name the program was invoked as; only its final path
            component is shown in the message.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    app = os.path.basename(app)
    sys.stderr.write("usage: %s [-h] URI\n" % (app))
    sys.exit(1)
25 | |
# main(args): command-line entry point.
# Parses options with getopt, loads the URI given as the first positional
# argument in Chrome through Selenium, and prints an Atom feed for the page
# to stdout. Returns 0.
# NOTE(review): this listing carries viewer residue (leading line numbers,
# trailing "| |", lost indentation) and several statements are cut off at
# "…"; the comments below describe only the visible part of each statement.
# NOTE(review): no driver.quit()/driver.close() call is visible in this
# function - confirm whether the browser is meant to be left to process exit.
26 def main(args): | |
# "h" is the only option letter accepted; args[0] is skipped as the program name.
27 try: | |
28 opts, largs = getopt.getopt(args[1:], "h") | |
29 except getopt.GetoptError as err: | |
# The getopt error text is printed with print() (stdout); usage() then
# writes the usage line to stderr and exits with status 1.
30 print(str(err)) | |
31 usage(args[0]) | |
32 | |
33 for o, a in opts: | |
34 if o == "-h": | |
35 usage(args[0]) | |
36 else: | |
# Defensive branch: getopt was only given "h", so no other option is
# expected to reach this point.
37 assert False, "unhandled option" | |
38 | |
# At least one positional argument (the URI) is required.
39 if len(largs) < 1: | |
40 usage(args[0]) | |
41 | |
# Only the first positional argument is used; any further ones are ignored.
42 link = largs[0] | |
43 | |
# Each string below is passed to Chrome through options.add_argument().
44 options = chromeoptions() | |
45 chromearguments = [ | |
46 "headless", | |
47 "no-sandbox", | |
48 "disable-extensions", | |
49 "disable-dev-shm-usage", | |
50 "start-maximized", | |
51 "window-size=1900,1080", | |
52 "disable-gpu" | |
53 ] | |
54 for carg in chromearguments: | |
55 options.add_argument(carg) | |
56 | |
57 driver = webdriver.Chrome(options=options) | |
58 driver.get(link) | |
59 | |
# Wait up to 60 seconds for an element matching //div[@data-last-letter] to
# be present. The result (isnews) is not read again in the visible code.
60 isnews = WebDriverWait(driver=driver, timeout=60).until( | |
61 EC.presence_of_element_located((By.XPATH, | |
62 "//div[@data-last-letter]") | |
63 ) | |
64 ) | |
# NOTE(review): the XPath and the end of this statement are truncated in this
# listing. newslist.find_elements() is called further down, so the full line
# presumably selects a single element - confirm against the repository.
65 newslist = driver.find_elements(By.XPATH, "//div[@data-filter-ta… | |
66 | |
# NOTE(review): truncated statement; the visible part queries a <meta> element
# whose property attribute starts with "og:ti".
67 title = driver.find_elements(By.XPATH, "//meta[@property=\"og:ti… | |
# The feed subtitle reuses the title value. globaltags is assigned here and
# not read again in the visible code.
68 description = title | |
69 globaltags = "" | |
70 | |
# Feed header. title and description are wrapped in CDATA sections; link is
# interpolated as-is into <id> and both <link> elements.
71 print("""<?xml version="1.0" encoding="utf-8"?>""") | |
72 print("""<feed xmlns="http://www.w3.org/2005/Atom">""") | |
73 print("\t<title><![CDATA[%s]]></title>" % (title)) | |
74 print("\t<subtitle><![CDATA[%s]]></subtitle>" % (description)) | |
75 print("\t<id>%s</id>" % (link)) | |
76 print("\t<link href=\"%s\" rel=\"self\" />" % (link)) | |
77 print("\t<link href=\"%s\" />" % (link)) | |
78 | |
# Timezone-aware current UTC time: used as the feed's <updated> value and,
# below, as the upper bound for entry years.
79 utcnow = datetime.now(pytz.utc) | |
80 print("\t<updated>%s</updated>" % (utcnow.isoformat())) | |
81 | |
# Each direct <div> child of newslist is treated as one article.
82 articles = newslist.find_elements(By.XPATH, "./div") | |
# Splits the URI on the first three "/" and drops the last piece; for a URI
# of the form scheme://host/path this yields "scheme://host".
83 baselink = "/".join(link.split("/", 3)[:-1]) | |
# Articles are emitted in reverse of the order returned by find_elements().
84 for article in articles[::-1]: | |
# Rebinds link from the feed URI string to the article's first <a> child;
# baselink and the feed-level output were computed before this point.
85 link = article.find_elements(By.XPATH, "./a")[0] | |
86 plink = link.get_attribute("href") | |
# hrefs that do not start with "http" are prefixed with baselink and a "/".
87 if not plink.startswith("http"): | |
88 plink = "%s/%s" % (baselink, plink) | |
89 ptitle = link.get_attribute("data-title") | |
90 pcontent = article.text | |
# NOTE(review): this literal looks like an e-mail address redacted by the web
# view - confirm the real value against the repository.
91 pauthor = "[email protected]" | |
92 | |
93 # Normalize datetime. | |
# NOTE(review): truncated statement; the visible part takes the first <time>
# descendant of the article.
94 updateds = article.find_elements(By.XPATH, ".//time")[0]… | |
# Articles whose date text does not match the (truncated) strptime format are
# skipped via continue.
95 try: | |
96 dtupdated = datetime.strptime(updateds, "%d.%m.%… | |
97 except ValueError: | |
98 continue | |
99 | |
# The time of day is fixed to 12:00:00 and the value is marked as UTC.
100 dtupdated = dtupdated.replace(hour=12, minute=0,\ | |
101 second=0, tzinfo=pytz.utc) | |
# A parsed year later than the current UTC year is replaced by the current year.
102 if dtupdated.year > utcnow.year: | |
103 dtupdated = dtupdated.replace(year=utcnow.year) | |
104 pupdated = dtupdated | |
105 | |
# One Atom <entry> per article. Title and content are wrapped in CDATA; plink
# and pauthor are interpolated as-is.
106 print("\t<entry>") | |
107 print("\t\t<id>%s</id>" % (plink)) | |
108 print("\t\t<title><![CDATA[%s]]></title>" % (ptitle)) | |
109 print("\t\t<link href=\"%s\" />" % (plink)) | |
110 print("\t\t<author><name>%s</name></author>" % (pauthor)) | |
111 print("\t\t<updated>%s</updated>" % (pupdated.isoformat(… | |
112 print("\t\t<content><![CDATA[%s]]></content>" % (pconten… | |
113 print("\t</entry>") | |
114 | |
115 print("</feed>") | |
116 | |
# The return value is passed to sys.exit() by the script entry guard.
117 return 0 | |
118 | |
# Script entry point: run main() with the raw argument vector and use its
# return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
121 |