eomyidae - eomyidae - a gopher crawler software | |
git clone git://bitreich.org/eomyidae | |
Log | |
Files | |
Refs | |
Tags | |
README | |
LICENSE | |
--- | |
eomyidae (15243B) | |
--- | |
1 #!/usr/bin/env python | |
2 # coding=utf-8 | |
3 # | |
4 # See the LICENSE file for details. | |
5 # | |
6 | |
7 import os | |
8 import sys | |
9 import getopt | |
10 import urllib.parse | |
11 import socket | |
12 import io | |
13 import pickle | |
14 import time | |
15 import hashlib | |
16 import errno | |
17 import random | |
18 import operator | |
19 import math | |
20 from multiprocessing import Pool | |
21 from datetime import datetime | |
22 from datetime import timedelta | |
23 | |
def parseuri(uri):
    """Split a gopher URI into ``(host, port, mtype, selector)``.

    * ``host`` is the network location without the port.
    * ``port`` is the port text taken from the URI (a ``str``) when one is
      given, otherwise the integer default ``70``.  Callers convert it with
      ``int()``.
    * ``mtype`` is the single item-type character that follows the first
      ``/`` of the path; it defaults to ``"1"`` (menu) when the path is
      too short to carry one.
    * ``selector`` is everything after the item type, with the query
      string re-attached after a ``?`` when the URI has one.
    """
    urls = urllib.parse.urlparse(uri, allow_fragments=False)

    host = urls.netloc
    port = 70
    if ":" in urls.netloc:
        (host, rawport) = urls.netloc.split(":")[:2]
        # "gopher://host:/1/..." has a colon but no port.  Returning the
        # empty string would make every caller's int(port) raise, so keep
        # the default port in that case.
        if rawport != "":
            port = rawport

    mtype = "1"
    if len(urls.path) > 1:
        mtype = urls.path[1]

    selector = ""
    if len(urls.path) > 2:
        selector = urls.path[2:]
        if len(urls.query) > 0:
            selector = "%s?%s" % (selector, urls.query)

    return (host, port, mtype, selector)
45 | |
def poolgopher(req):
    """Worker entry point: fetch one job and attach the result to it.

    ``req`` is a list whose first four items are passed positionally to
    ``gopher()`` as uri, host, port and selector.  The fetched text is
    appended to that same list, which is then returned.
    """
    (uri, host, port, selector) = (req[0], req[1], req[2], req[3])
    req.append(gopher(uri, host, port, selector))
    return req
50 | |
def gopher(uri=None, host=None, port=70, selector=""):
    """Fetch one gopher selector and return the response as text.

    When ``uri`` is given it overrides ``host``, ``port`` and ``selector``
    with the values parsed from it.  ``port`` may be an int or a numeric
    string; a non-numeric port raises ``ValueError`` from ``int()``.

    The request is best effort: any connection, send or read failure
    (name resolution, timeout, reset, refused, unroutable host, port out
    of range) yields the empty string instead of an exception.

    Returns the whole response decoded as UTF-8, with undecodable bytes
    replaced by U+FFFD.
    """
    if uri is not None:
        (host, port, mtype, selector) = parseuri(uri)
    port = int(port)

    request = ("%s\r\n" % (selector)).encode("utf-8")

    # The context managers close the socket and its file wrapper on every
    # exit path, so a failed request no longer leaks a descriptor.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(20)
        try:
            sock.connect((host, port))
            # sendall() keeps writing until the full request is out;
            # send() may transmit only part of it.
            sock.sendall(request)
            with sock.makefile("rb") as fd:
                data = fd.read()
        except (OSError, OverflowError, UnicodeError):
            # OSError covers gaierror, timeouts, resets, broken pipes and
            # "no route to host"; OverflowError is raised for a port
            # outside 0-65535; UnicodeError is raised when the host name
            # cannot be encoded for resolution.
            return ""

    # errors="replace" never raises, so no second decoding attempt is
    # needed.
    return data.decode(errors="replace")
99 | |
def parsemenu(data):
    """Parse gopher menu text into a list of entries.

    Each entry is ``[mtype, description, selector, host, port]`` with all
    fields as strings.  Blank lines and lines with fewer than four
    tab-separated fields are skipped.  Parsing stops at the first line
    that starts with ``.``.
    """
    entries = []
    for rawline in data.split("\n"):
        stripped = rawline.strip()
        if not stripped:
            continue

        itemtype = stripped[0]
        if itemtype == ".":
            # Terminator line: nothing after it is parsed.
            break

        fields = stripped[1:].split("\t")
        if len(fields) >= 4:
            # Fields beyond the fourth are dropped.
            entries.append([itemtype] + fields[:4])

    return entries
121 | |
def menu2text(menu):
    """Return the descriptions of all menu entries, one per line.

    ``menu`` is a list of entries whose second item is the description.
    Entries whose description is not exactly of type ``str`` are skipped.
    Every emitted description is followed by a newline.
    """
    return "".join("%s\n" % (item[1]) for item in menu
            if type(item[1]) is str)
131 | |
132 ## Robots.txt | |
133 # https://en.wikipedia.org/wiki/Robots.txt | |
134 # # Comment | |
135 # User-agent: somebot | |
136 # Disallow: /path | |
137 # Allow: /path | |
138 # Crawl-delay: seconds | |
def parserobots(data):
    """Parse robots.txt text into a list of ``[header, value]`` pairs.

    * Everything from ``#`` to the end of a line is a comment and dropped.
    * Lines without a ``:`` (including blank and comment-only lines) are
      ignored.
    * ``header`` is stripped and lower-cased.
    * ``value`` is stripped.  It is lower-cased only for ``user-agent``
      lines, so agent names compare case-insensitively while Allow and
      Disallow paths keep their case: selectors are matched
      case-sensitively, and a lower-cased rule would silently stop
      matching a selector that contains upper-case characters.
    """
    robots = []
    for rawline in data.split("\n"):
        line = rawline.strip()
        if "#" in line:
            line = line.split("#", 1)[0]
        # Blank lines are skipped rather than recorded as separators; a
        # new user-agent line is what starts the next group of rules.
        if ":" not in line:
            continue

        (header, value) = line.split(":", 1)
        header = header.strip().lower()
        value = value.strip()
        if header == "user-agent":
            value = value.lower()
        robots.append([header, value])
    return robots
158 | |
def adaptrobots(robotsdata):
    """Reduce robots.txt text to the rules that apply to this crawler.

    The text is parsed with ``parserobots()``.  Rules are collected only
    while the most recent ``user-agent`` line names ``eomyidae`` (the
    part before an optional ``/``) or ``*``.  An entry with an empty
    header ends the current group.

    Returns a dict with the keys ``"allow"`` and ``"disallow"`` (lists of
    rule values), ``"other"`` (list of ``[header, value]`` pairs for any
    other applicable line) and ``"empty"`` (``True`` when all three lists
    are empty).
    """
    allowed = []
    disallowed = []
    others = []
    applies = False

    for entry in parserobots(robotsdata):
        field = entry[0].lower()
        val = entry[1]

        if field == "user-agent":
            applies = val.split("/")[0] in ("eomyidae", "*")
        elif field == "":
            applies = False
        elif applies:
            if field == "allow":
                allowed.append(val)
            elif field == "disallow":
                disallowed.append(val)
            else:
                others.append([field, val])

    return {
        "allow": allowed,
        "disallow": disallowed,
        "other": others,
        "empty": not (allowed or disallowed or others),
    }
198 | |
def mkpath(cachepath):
    """Create ``cachepath`` and any missing parent directories.

    An "already exists" error is ignored; every other ``OSError`` is
    re-raised.
    """
    try:
        os.makedirs(cachepath)
    except OSError as err:
        if err.errno == errno.EEXIST:
            return
        raise
205 | |
def mkopen(cachefile):
    """Open ``cachefile`` for binary writing and return the file object.

    A file that does not exist yet is opened in exclusive-create mode
    (``"xb"``); an existing one is truncated (``"wb"``).  The caller is
    responsible for closing the returned file.
    """
    mode = "wb" if os.path.exists(cachefile) else "xb"
    return open(cachefile, mode)
212 | |
def informserveradmin(uri, host=None, port=70):
    """Send a self-identifying request to a server.

    The crawler's greeting is sent as the selector of an ordinary gopher
    request and the response is discarded.  When ``host`` is not given,
    host and port are taken from ``uri``.
    """
    if host is None:
        (host, port, _mtype, _selector) = parseuri(uri)
        # NOTE(review): indentation was lost in this copy of the file; the
        # int() conversion is assumed to belong to this branch -- confirm.
        port = int(port)

    # Tell the server operator who is crawling and where to read more.
    greeting = ("This is eomyidae, your friendly crawler. See "
            "gopher://gopherproject.org/1/eomyidae for more info. "
            "Have a nice day!")
    gopher(host=host, port=port, selector=greeting)
223 | |
224 def cacherobots(cachedir, uri, host=None, port=70, force=False, \ | |
225 filtercache=None): | |
226 if host == None: | |
227 (host, port, mtype, selector) = parseuri(uri) | |
228 port = int(port) | |
229 | |
230 if filtercache != None and host in filtercache: | |
231 #print("Got filterlines from memory filtercache.") | |
232 return filtercache[host] | |
233 | |
234 print("Getting robots for %s:%d" % (host, port)) | |
235 | |
236 cachepath = "%s/%s:%d" % (cachedir, host, port) | |
237 mkpath(cachepath) | |
238 | |
239 cacherobotstxt = "%s/robots.txt" % (cachepath) | |
240 cacherobotspickle = "%s/robots.pickle" % (cachepath) | |
241 filterlines = {} | |
242 if not os.path.exists(cacherobotstxt) or force == True: | |
243 # Be nice. | |
244 informserveradmin(uri=uri, host=host, port=port) | |
245 | |
246 robotsdata = gopher(host=host, port=port, selector="/rob… | |
247 print("Got new robots.txt.") | |
248 print(robotsdata) | |
249 robotstxtfd = mkopen(cacherobotstxt) | |
250 robotstxtfd.write(robotsdata.encode()) | |
251 robotstxtfd.close() | |
252 | |
253 filterlines = adaptrobots(robotsdata) | |
254 # Do not store if there is nothing, so we save I/O later. | |
255 if filterlines["empty"] == False: | |
256 print("Storing filterlines.") | |
257 storelistdb(cacherobotspickle, filterlines) | |
258 | |
259 else: | |
260 if os.path.exists(cacherobotspickle): | |
261 #print("Loading filterlines from cache.") | |
262 filterlines = loadlistdb(cacherobotspickle) | |
263 else: | |
264 #print("No filterlines available in cache.") | |
265 filterlines["empty"] = True | |
266 | |
267 #print(filterlines) | |
268 if filtercache != None: | |
269 filtercache[host] = filterlines | |
270 | |
271 return filterlines | |
272 | |
def selectorisallowed(filterlines, selector):
    """Decide whether robots rules permit fetching ``selector``.

    ``filterlines`` is a dict with the keys ``"empty"``, ``"disallow"``
    and ``"allow"``.  With ``"empty"`` set, everything is allowed.
    Otherwise a selector is refused when any disallow pattern matches it,
    unless an allow pattern matches as well; allow always wins.  Empty
    patterns are skipped.

    Patterns are prefix matches by default.  A leading ``*`` turns the
    pattern into a suffix match, a trailing ``*`` is an explicit prefix
    match, ``*`` on both ends is a substring match, and a lone ``*``
    matches everything.
    """
    if filterlines["empty"] == True:
        return True

    def robotsmatch(pattern, selector):
        if pattern == "*":
            return True

        leading = pattern[0] == "*"
        trailing = pattern[-1] == "*"
        if leading and trailing:
            return pattern[1:-1] in selector
        if leading:
            return selector.endswith(pattern[1:])
        if trailing:
            return selector.startswith(pattern[:-1])
        return selector.startswith(pattern)

    # TODO: Should an empty pattern match everything?
    denied = any(robotsmatch(rule, selector)
            for rule in filterlines["disallow"] if len(rule) != 0)
    permitted = any(robotsmatch(rule, selector)
            for rule in filterlines["allow"] if len(rule) != 0)

    return permitted or not denied
317 | |
def loadselectorstxt(filename):
    """Load a ``|``-separated text file as a list of field lists.

    Each line of the file becomes one list produced by splitting it on
    ``|``; the line ending stays attached to the last field.  A missing
    file yields an empty list.
    """
    if not os.path.exists(filename):
        return []

    # The with-block closes the file even when reading raises.
    with open(filename, "r") as fd:
        return [line.split("|") for line in fd]
329 | |
def loadlist(filename):
    """Load a plain-text list file, one element per line.

    Lines are stripped of surrounding whitespace.  Empty lines and lines
    that start with ``#`` are skipped.  A missing file yields an empty
    list.
    """
    if not os.path.exists(filename):
        return []

    # The with-block closes the file even when reading raises.
    with open(filename, "r") as fd:
        stripped = (raw.strip() for raw in fd)
        return [entry for entry in stripped
                if entry and not entry.startswith("#")]
345 | |
def loadlistdb(filename):
    """Load a pickled object from ``filename``.

    Returns the unpickled object, or an empty list when the file does not
    exist or ends prematurely (``EOFError``, e.g. an empty file).
    """
    if not os.path.exists(filename):
        return []

    # The with-block closes the file on the EOFError path as well.
    with open(filename, "rb") as fd:
        try:
            # NOTE(review): unpickling can execute arbitrary code; this is
            # only safe while these files are written by this program.
            return pickle.load(fd)
        except EOFError:
            return []
358 | |
def storelistdb(filename, listelems):
    """Pickle ``listelems`` into ``filename``, replacing it atomically.

    The data is first written to ``<filename>.tmp`` in the same directory
    and then renamed over the target.  An interrupted or failed dump
    therefore leaves the previous file intact instead of a truncated
    pickle.  On failure the temporary file is removed and the exception is
    re-raised.
    """
    tmpname = "%s.tmp" % (filename)
    try:
        with open(tmpname, "wb") as fd:
            pickle.dump(listelems, fd)
        os.replace(tmpname, filename)
    except BaseException:
        # Also covers KeyboardInterrupt, so Ctrl-C during a dump does not
        # leave a stray temporary file behind.
        try:
            os.remove(tmpname)
        except OSError:
            pass
        raise
363 | |
def storerawdata(cachedir, uri, data, host=None, port=70):
    """Write a fetched response into the per-host cache directory.

    The file lives at ``<cachedir>/<host>:<port>/<sha256 of uri>.menu``.
    Its first line is the URI; the response text follows.  When ``host``
    is not given, host and port are taken from ``uri``.
    """
    if host is None:
        (host, port, _mtype, _selector) = parseuri(uri)
        # NOTE(review): indentation was lost in this copy of the file; the
        # int() conversion is assumed to belong to this branch -- confirm.
        port = int(port)

    hostdir = "%s/%s:%s" % (cachedir, host, port)
    mkpath(hostdir)

    # The file name is derived from the URI so each URI gets its own file.
    urihash = hashlib.sha256(uri.encode()).hexdigest()
    target = "%s/%s.menu" % (hostdir, urihash)

    out = mkopen(target)
    out.write(("%s\n" % (uri)).encode())
    out.write(data.encode())
    out.close()
382 | |
383 def usage(app): | |
384 app = os.path.basename(app) | |
385 print("usage: %s [-hor] [-b base] [-f blocklist] [-w n] [startur… | |
386 sys.exit(1) | |
387 | |
388 def main(args): | |
389 try: | |
390 opts, largs = getopt.getopt(args[1:], "hb:f:ow:r") | |
391 except getopt.GetoptError as err: | |
392 print(str(err)) | |
393 usage(args[0]) | |
394 | |
395 blocklistfile = None | |
396 blocklist = [] | |
397 | |
398 base = "." | |
399 starturi = None | |
400 workernum = 1 | |
401 robotscache = {} | |
402 forcehostscount = False | |
403 for o, a in opts: | |
404 if o == "-h": | |
405 usage(args[0]) | |
406 elif o == "-b": | |
407 base = a | |
408 elif o == "-f": | |
409 blocklistfile = a | |
410 blocklist = loadlist(blocklistfile) | |
411 print("blocklist: %s" % (blocklist)) | |
412 elif o == "-o": | |
413 forcehostscount = True | |
414 elif o == "-r": | |
415 # Do not cache robots.txt in memory. | |
416 robotscache = None | |
417 elif o == "-w": | |
418 try: | |
419 workernum = int(a) | |
420 except ValueError: | |
421 workernum = 1 | |
422 else: | |
423 assert False, "unhandled option" | |
424 | |
425 os.chdir(base) | |
426 cachedir = "%s/cache" % (base) | |
427 | |
428 if len(largs) > 0: | |
429 starturi = largs[0] | |
430 | |
431 knownuris = loadlistdb("knownuris.pickle") | |
432 if knownuris == []: | |
433 knownuris = {} | |
434 lastlenknownuris = len(knownuris) | |
435 | |
436 def isblocked(uri): | |
437 for rule in blocklist: | |
438 if uri.startswith(rule): | |
439 return True | |
440 return False | |
441 | |
442 def addhostscount(host): | |
443 if host in hostscount: | |
444 hostscount[host] += 1 | |
445 else: | |
446 hostscount[host] = 1 | |
447 | |
448 def subhostscount(host): | |
449 if host in hostscount: | |
450 hostscount[host] -= 1 | |
451 if hostscount[host] <= 0: | |
452 del hostscount[host] | |
453 | |
454 def addhostscache(uri, host=None, port=70, selector="/"): | |
455 if uri != None and host == None: | |
456 (host, port, mtype, selector) = parseuri(uri) | |
457 port = int(port) | |
458 else: | |
459 try: | |
460 port = int(port) | |
461 except ValueError: | |
462 return | |
463 | |
464 if uri in knownuris: | |
465 print("ignored for queue: %s" % (uri)) | |
466 return | |
467 if host == "": | |
468 print("ignored for queue: %s" % (uri)) | |
469 return | |
470 if isblocked(uri): | |
471 print("blocked by filters: %s" % (uri)) | |
472 return | |
473 | |
474 addhostscount(host) | |
475 | |
476 if not host in hostscache: | |
477 hostscache[host] = {} | |
478 if not "queue" in hostscache[host]: | |
479 hostscache[host]["queue"] = {} | |
480 | |
481 filterrules = cacherobots(cachedir, uri, \ | |
482 host=host, \ | |
483 port=port, \ | |
484 filtercache=robotscache) | |
485 if selectorisallowed(filterrules, selector) == True: | |
486 hostscache[host]["queue"][uri] = None | |
487 print("pushed to queue: %s" % (uri)) | |
488 else: | |
489 pass | |
490 print("blocked by robots: %s" % (uri)) | |
491 | |
492 def getqueuelen(): | |
493 queuelen = 0 | |
494 for host in hostscache: | |
495 queuelen += len(hostscache[host]["queue"]) | |
496 return queuelen | |
497 | |
498 hostscache = loadlistdb("hostscache.pickle") | |
499 if hostscache == []: | |
500 hostscache = {} | |
501 hostscount = loadlistdb("hostscount.pickle") | |
502 if hostscount == [] or forcehostscount == True: | |
503 hostscount = {} | |
504 for host in list(hostscache.keys()): | |
505 print("host = %s, queuelen = %d" \ | |
506 % (host, \ | |
507 len(hostscache[host]["queue"]… | |
508 if len(hostscache[host]["queue"]) == 0: | |
509 del hostscache[host] | |
510 continue | |
511 for uri in hostscache[host]["queue"]: | |
512 (host, port, mtype, selector) = parseuri… | |
513 addhostscount(host) | |
514 | |
515 def storestate(): | |
516 if blocklistfile != None: | |
517 blocklist = loadlist(blocklistfile) | |
518 if len(blocklist) > 0: | |
519 print("blocklist: %s" % (blocklist)) | |
520 print("################## Storing state to disc.") | |
521 storelistdb("knownuris.pickle", knownuris) | |
522 storelistdb("hostscache.pickle", hostscache) | |
523 storelistdb("hostscount.pickle", hostscount) | |
524 print("################## Storing state to disc done.") | |
525 | |
526 jobs = [] | |
527 if starturi != None: | |
528 #print("starturi = %s" % (starturi)) | |
529 if not isblocked(starturi): | |
530 (starthost, startport, startmtype, startselector… | |
531 addhostscache(starturi, \ | |
532 selector=startselector, \ | |
533 host=starthost, \ | |
534 port=startport) | |
535 try: | |
536 jobs.append([starturi, starthost, int(st… | |
537 except ValueError: | |
538 # Please fix your URI. | |
539 pass | |
540 | |
541 # Store state keeper. | |
542 startnow = datetime.now() | |
543 storedelta = timedelta(seconds=10) # 30 seconds | |
544 | |
545 lastlenknownhosts = len(hostscache) | |
546 lastlenuriqueue = getqueuelen() | |
547 while lastlenuriqueue > 0: | |
548 if len(jobs) < workernum: | |
549 for host in list(hostscache.keys()): | |
550 if len(hostscache[host]["queue"]) == 0: | |
551 del hostscache[host] | |
552 if host in hostscount: | |
553 del hostscount[host] | |
554 | |
555 selhosts = sorted(hostscount.items(), \ | |
556 key=operator.itemgetter(1))[:wor… | |
557 | |
558 # Give hosts with many selectors more jobs. | |
559 hostjobs = {} | |
560 for selhost in selhosts: | |
561 # 10 ** x | |
562 hostjobs[selhost[0]] = \ | |
563 math.floor(math.log10(selhost[1]… | |
564 if hostjobs[selhost[0]] == 0: | |
565 hostjobs[selhost[0]] = 1 | |
566 print("Queue Status: %s" % (hostjobs)) | |
567 | |
568 for selhost in selhosts: | |
569 selhost = selhost[0] | |
570 seluris = hostscache[selhost]["queue"] | |
571 while hostjobs[selhost] > 0: | |
572 if len(seluris) == 0: | |
573 break | |
574 jobitem = seluris.popitem() | |
575 if isblocked(jobitem[0]): | |
576 continue | |
577 (host, port, mtype, selector) = … | |
578 job = [jobitem[0], host, port, s… | |
579 if job not in jobs: | |
580 jobs.append([jobitem[0],… | |
581 hostjobs[selhost] -= 1 | |
582 | |
583 print("Getting %d jobs." % (len(jobs))) | |
584 | |
585 dataresults = [] | |
586 with Pool(processes=workernum) as pool: | |
587 dataresults = pool.map(poolgopher, jobs) | |
588 #data = gopher(host=host, port=port, selector=se… | |
589 jobs = [] | |
590 | |
591 for dataresult in dataresults: | |
592 (cururi, host, port, selector, data) = dataresult | |
593 subhostscount(host) | |
594 storerawdata(cachedir, cururi, data, host=host, … | |
595 menudata = parsemenu(data) | |
596 #print(menudata) | |
597 for mi in menudata: | |
598 # Only menus so far. | |
599 if mi[0] == "1": | |
600 # Fix menu items with ports in h… | |
601 if ":" in mi[3]: | |
602 mi[3] = mi[3].split(":")… | |
603 | |
604 guri = "gopher://%s:%s/%s%s" % \ | |
605 (mi[3], mi[4], m… | |
606 | |
607 addhostscache(guri, host=mi[3], \ | |
608 port=mi[4], \ | |
609 selector=mi[2]) | |
610 | |
611 print("Uri %s done." % (cururi)) | |
612 knownuris[cururi] = None | |
613 | |
614 lenuriqueue = getqueuelen() | |
615 lenknownuris = len(knownuris) | |
616 lenknownhosts = len(hostscache) | |
617 print("> queue hosts = %d (%d) %s" % \ | |
618 (lenknownhosts, lenknownhosts - | |
619 lastlenknownhosts, hostscache.ke… | |
620 print("> uri queue len = %d (%d)" % \ | |
621 (lenuriqueue, lenuriqueue - lastlenuriqu… | |
622 print("> visited uris = %d (%d)" % \ | |
623 (lenknownuris, lenknownuris - lastlenkno… | |
624 lastlenknownuris = lenknownuris | |
625 lastlenuriqueue = lenuriqueue | |
626 lastlenknownhosts = lenknownhosts | |
627 | |
628 # TODO: Remove after debugging | |
629 nowdelta = datetime.now() - startnow | |
630 if nowdelta >= storedelta: | |
631 storestate() | |
632 startnow = datetime.now() | |
633 | |
634 time.sleep(0.2) # don't be too harsh on servers | |
635 | |
636 #break #oneshot | |
637 | |
638 # Save at end of even single shot. | |
639 storestate() | |
640 | |
641 return 0 | |
642 | |
# Script entry point: run the crawler and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
645 |