Introduction
Introduction Statistics Contact Development Disclaimer Help
eomyidae - eomyidae - a gopher crawler software
git clone git://bitreich.org/eomyidae
Log
Files
Refs
Tags
README
LICENSE
---
eomyidae (15243B)
---
1 #!/usr/bin/env python
2 # coding=utf-8
3 #
4 # See the LICENSE file for details.
5 #
6
7 import os
8 import sys
9 import getopt
10 import urllib.parse
11 import socket
12 import io
13 import pickle
14 import time
15 import hashlib
16 import errno
17 import random
18 import operator
19 import math
20 from multiprocessing import Pool
21 from datetime import datetime
22 from datetime import timedelta
23
def parseuri(uri):
    """Split a gopher URI into ``(host, port, mtype, selector)``.

    The port is the string found in the URI when one is given and the
    integer 70 otherwise, so callers convert it with ``int()``.  The item
    type is the character following the leading "/" of the path ("1" when
    the path is too short to carry one).  The selector is the remainder of
    the path, with "?query" re-attached when a query string is present.
    """
    parts = urllib.parse.urlparse(uri, allow_fragments=False)

    netloc = parts.netloc
    if ":" in netloc:
        host, port = netloc.split(":")[:2]
    else:
        host, port = netloc, 70

    path = parts.path
    mtype = path[1] if len(path) > 1 else "1"

    # path[2:] is empty exactly when the path carries no selector, and the
    # query is only re-attached to a non-empty selector.
    selector = path[2:]
    if selector and len(parts.query) > 0:
        selector = "%s?%s" % (selector, parts.query)

    return (host, port, mtype, selector)
45
def poolgopher(req):
    """Worker entry point: fetch one job and attach the response to it.

    ``req`` is a list whose first four items are (uri, host, port,
    selector).  The fetched text is appended to that same list, which is
    then returned.
    """
    uri, host, port, selector = req[0], req[1], req[2], req[3]
    req.append(gopher(uri, host, port, selector))
    return req
50
def gopher(uri=None, host=None, port=70, selector=""):
    """Fetch one gopher selector and return the response as text.

    When ``uri`` is given it is parsed and overrides ``host``, ``port`` and
    ``selector``; otherwise the three are used as passed.

    Returns the response decoded as UTF-8 (undecodable bytes are replaced),
    or "" when the connection, the request or the read fails.  A crawler
    must survive unreachable or misbehaving servers, so every socket-level
    error is treated as "no data" instead of only a hand-picked few.
    """
    if uri is not None:
        (host, port, mtype, selector) = parseuri(uri)
        port = int(port)

    # The context manager closes the socket on every path; it used to be
    # left open for the garbage collector.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(20)
        try:
            s.connect((host, port))
            # sendall() keeps writing until the whole request is out;
            # send() may stop after a partial write.
            s.sendall(("%s\r\n" % (selector)).encode("utf-8"))
            fd = s.makefile("rb")
            try:
                data = fd.read()
            finally:
                fd.close()
        except (OSError, OverflowError):
            # OSError covers resolver errors, timeouts, refused or reset
            # connections, broken pipes and "no route to host";
            # OverflowError is raised for a port outside 0-65535.
            return ""

    # errors="replace" never raises, so no fallback codec is needed.
    return data.decode(errors="replace")
99
def parsemenu(data):
    """Parse gopher menu text into ``[mtype, description, selector, host, port]`` rows.

    Lines are stripped of surrounding whitespace; empty lines and lines
    with fewer than four tab-separated fields after the type character are
    skipped.  Parsing stops at the first line that starts with ".".
    All fields are returned as strings.
    """
    entries = []
    for raw in data.split("\n"):
        row = raw.strip()
        if not row:
            continue

        mtype = row[0]
        if mtype == ".":
            # End-of-menu marker.
            break

        fields = row[1:].split("\t")
        if len(fields) >= 4:
            entries.append([mtype] + fields[:4])

    return entries
121
def menu2text(menu):
    """Join the description field of every menu row into newline-terminated text.

    Rows whose description (index 1) is not exactly a ``str`` are skipped.
    """
    return "".join(
        "%s\n" % (entry[1])
        for entry in menu
        if type(entry[1]) is str
    )
131
132 ## Robots.txt
133 # https://en.wikipedia.org/wiki/Robots.txt
134 # # Comment
135 # User-agent: somebot
136 # Disallow: /path
137 # Allow: /path
138 # Crawl-delay: seconds
def parserobots(data):
    """Parse robots.txt text into a list of ``[header, value]`` pairs.

    * A blank line yields the separator pair ``["", ""]``, which marks the
      end of a bot-specific record.  The previous check, ``len(line) < 0``,
      could never be true, so the separator was never produced.
    * Everything from "#" to the end of a line is a comment.  A line that
      holds only a comment is dropped and is not a record boundary.
    * Lines without ":" are ignored.
    * Header names are lower-cased.  Values are lower-cased only for
      ``user-agent`` (agent names compare case-insensitively); all other
      values keep their case because selectors are case-sensitive.
    """
    robots = []
    for raw in data.split("\n"):
        line = raw.strip()
        if len(line) == 0:
            # Empty line, needed for bot-specific rules.
            robots.append(["", ""])
            continue

        line = line.split("#", 1)[0]
        if ":" not in line:
            continue

        (header, value) = line.split(":", 1)
        header = header.strip().lower()
        value = value.strip()
        if header == "user-agent":
            value = value.lower()
        robots.append([header, value])
    return robots
158
def adaptrobots(robotsdata):
    """Reduce robots.txt text to the rules that apply to this crawler.

    Rules are collected while the most recent ``user-agent`` value, taken
    up to its first "/", is "eomyidae" or "*"; a separator entry (empty
    header) switches collection off again.

    Returns a dict with the keys "allow" and "disallow" (lists of values),
    "other" (list of ``[header, value]`` pairs for every other header) and
    "empty" (True when all three lists are empty).
    """
    allowed = []
    disallowed = []
    others = []
    applies = False

    for entry in parserobots(robotsdata):
        header = entry[0].lower()
        value = entry[1]

        if header == "user-agent":
            applies = value.split("/")[0] in ("eomyidae", "*")
        elif header == "":
            applies = False
        elif applies:
            if header == "allow":
                allowed.append(value)
            elif header == "disallow":
                disallowed.append(value)
            else:
                others.append([header, value])

    return {
        "allow": allowed,
        "disallow": disallowed,
        "other": others,
        "empty": not (allowed or disallowed or others),
    }
198
def mkpath(cachepath):
    """Create ``cachepath`` and any missing parents.

    An already existing path is not an error; every other ``OSError``
    propagates to the caller.
    """
    try:
        os.makedirs(cachepath)
    except FileExistsError:
        # FileExistsError is the OSError subclass Python raises for EEXIST.
        pass
205
def mkopen(cachefile):
    """Open ``cachefile`` for binary writing and return the file object.

    Mode "wb" creates the file when it is missing and truncates it when it
    exists.  The earlier exists()-then-"xb" sequence did the same thing but
    could raise FileExistsError if the file appeared between the check and
    the open.  The caller is responsible for closing the returned object.
    """
    return open(cachefile, "wb")
212
def informserveradmin(uri, host=None, port=70):
    """Send a self-identifying request to the server before fetching robots.txt.

    The crawler's greeting is sent as the selector so that it can show up
    in the server's logs; the response is discarded.  When ``host`` is not
    given, host and port are taken from ``uri``.
    """
    if host is None:
        (host, port, mtype, selector) = parseuri(uri)
        port = int(port)

    # We are nice and tell the admin how to find out more about us.
    greeting = ("This is eomyidae, your friendly crawler. "
                "See gopher://gopherproject.org/1/eomyidae "
                "for more info. Have a nice day!")
    gopher(host=host, port=port, selector=greeting)
223
def cacherobots(cachedir, uri, host=None, port=70, force=False,
                filtercache=None):
    """Return the robots filter rules for a host, fetching them when needed.

    Lookup order:
      1. ``filtercache`` (an optional in-memory dict keyed by host);
      2. a fresh fetch, when ``<cachedir>/<host>:<port>/robots.txt`` does
         not exist yet or ``force`` is set -- the raw text is written to
         that file and non-empty rules are pickled next to it;
      3. the pickled rules from an earlier fetch, or ``{"empty": True}``
         when there are none.

    The result is stored in ``filtercache`` when one was passed.
    """
    if host is None:
        (host, port, mtype, selector) = parseuri(uri)
        port = int(port)

    if filtercache is not None and host in filtercache:
        return filtercache[host]

    print("Getting robots for %s:%d" % (host, port))

    hostdir = "%s/%s:%d" % (cachedir, host, port)
    mkpath(hostdir)
    txtpath = "%s/robots.txt" % (hostdir)
    picklepath = "%s/robots.pickle" % (hostdir)

    if force == True or not os.path.exists(txtpath):
        # Be nice.
        informserveradmin(uri=uri, host=host, port=port)

        # NOTE(review): the listing cuts this line off after '"/rob';
        # "/robots.txt" is assumed -- confirm against the real file.
        robotsdata = gopher(host=host, port=port, selector="/robots.txt")
        print("Got new robots.txt.")
        print(robotsdata)
        with mkopen(txtpath) as txtfd:
            txtfd.write(robotsdata.encode())

        filterlines = adaptrobots(robotsdata)
        # Do not store if there is nothing, so we save I/O later.
        if not filterlines["empty"]:
            print("Storing filterlines.")
            storelistdb(picklepath, filterlines)
    elif os.path.exists(picklepath):
        filterlines = loadlistdb(picklepath)
    else:
        filterlines = {"empty": True}

    if filtercache is not None:
        filtercache[host] = filterlines

    return filterlines
272
def selectorisallowed(filterlines, selector):
    """Decide whether robots rules permit crawling ``selector``.

    ``filterlines`` is a rules dict with the keys "empty", "allow" and
    "disallow".  An empty rule set allows everything.  Otherwise a selector
    is refused when some disallow pattern matches it and no allow pattern
    does; allow patterns always win.  Zero-length patterns are ignored.

    Pattern forms: "*" matches anything, "*x*" is a substring test, "*x" a
    suffix test, and both "x*" and plain "x" are prefix tests.
    """
    if filterlines["empty"] == True:
        return True

    def robotsmatch(pattern, selector):
        if pattern == '*':
            return True
        leading = pattern[0] == '*'
        trailing = pattern[-1] == '*'
        if leading and trailing:
            return pattern[1:-1] in selector
        if leading:
            return selector.endswith(pattern[1:])
        if trailing:
            return selector.startswith(pattern[:-1])
        return selector.startswith(pattern)

    def matchesany(patterns):
        # TODO: Should an empty pattern match everything?
        return any(robotsmatch(pattern, selector)
                   for pattern in patterns if len(pattern) != 0)

    blocked = matchesany(filterlines["disallow"])
    permitted = matchesany(filterlines["allow"])
    return permitted or not blocked
317
def loadselectorstxt(filename):
    """Read a "|"-separated text file into a list of field lists.

    Each line is split on "|" exactly as read, so the last field of a line
    keeps its trailing newline.  A missing file gives an empty list.
    """
    if not os.path.exists(filename):
        return []

    with open(filename, "r") as fd:
        return [row.split("|") for row in fd]
329
def loadlist(filename):
    """Read a plain-text list file, one entry per line.

    Entries are stripped of surrounding whitespace; blank lines and lines
    starting with "#" are skipped.  A missing file gives an empty list.
    """
    if not os.path.exists(filename):
        return []

    with open(filename, "r") as fd:
        stripped = (raw.strip() for raw in fd)
        return [entry for entry in stripped
                if entry and entry[0] != "#"]
345
def loadlistdb(filename):
    """Load a pickled object from ``filename``.

    Returns the unpickled object, or an empty list when the file does not
    exist or ends prematurely (for example an empty or truncated pickle).
    The file is closed on every path; it used to stay open when the
    EOFError branch returned early.

    NOTE: pickle.load() can execute arbitrary code from the file, so this
    must only be pointed at state files the crawler wrote itself.
    """
    if not os.path.exists(filename):
        return []

    with open(filename, "rb") as fd:
        try:
            return pickle.load(fd)
        except EOFError:
            return []
358
def storelistdb(filename, listelems):
    """Pickle ``listelems`` to ``filename``.

    The data is written to ``<filename>.tmp`` first and then renamed over
    the target, so an interrupted write cannot leave a truncated state
    file behind.  The ``with`` block closes the handle even when dumping
    fails.
    """
    tmpname = "%s.tmp" % (filename)
    with open(tmpname, "wb") as fd:
        pickle.dump(listelems, fd)
    # Both names are in the same directory, so this is a single rename.
    os.replace(tmpname, filename)
363
def storerawdata(cachedir, uri, data, host=None, port=70):
    """Write a fetched response into the per-host cache directory.

    The file is ``<cachedir>/<host>:<port>/<sha256 of uri>.menu`` and holds
    the URI on its first line followed by the response text.  When ``host``
    is not given, host and port are taken from ``uri``.
    """
    if host is None:
        (host, port, mtype, selector) = parseuri(uri)
        port = int(port)

    hostdir = "%s/%s:%s" % (cachedir, host, port)
    mkpath(hostdir)

    urihash = hashlib.sha256(uri.encode()).hexdigest()
    target = "%s/%s.menu" % (hostdir, urihash)

    with mkopen(target) as fd:
        fd.write(("%s\n" % (uri)).encode())
        fd.write(data.encode())
382
383 def usage(app):
384 app = os.path.basename(app)
385 print("usage: %s [-hor] [-b base] [-f blocklist] [-w n] [startur…
386 sys.exit(1)
387
388 def main(args):
389 try:
390 opts, largs = getopt.getopt(args[1:], "hb:f:ow:r")
391 except getopt.GetoptError as err:
392 print(str(err))
393 usage(args[0])
394
395 blocklistfile = None
396 blocklist = []
397
398 base = "."
399 starturi = None
400 workernum = 1
401 robotscache = {}
402 forcehostscount = False
403 for o, a in opts:
404 if o == "-h":
405 usage(args[0])
406 elif o == "-b":
407 base = a
408 elif o == "-f":
409 blocklistfile = a
410 blocklist = loadlist(blocklistfile)
411 print("blocklist: %s" % (blocklist))
412 elif o == "-o":
413 forcehostscount = True
414 elif o == "-r":
415 # Do not cache robots.txt in memory.
416 robotscache = None
417 elif o == "-w":
418 try:
419 workernum = int(a)
420 except ValueError:
421 workernum = 1
422 else:
423 assert False, "unhandled option"
424
425 os.chdir(base)
426 cachedir = "%s/cache" % (base)
427
428 if len(largs) > 0:
429 starturi = largs[0]
430
431 knownuris = loadlistdb("knownuris.pickle")
432 if knownuris == []:
433 knownuris = {}
434 lastlenknownuris = len(knownuris)
435
def isblocked(uri):
    """Return True when ``uri`` starts with any rule of the blocklist."""
    return any(uri.startswith(rule) for rule in blocklist)
441
def addhostscount(host):
    """Increase the counter kept for ``host``, starting at 1 for a new host."""
    hostscount[host] = hostscount.get(host, 0) + 1
447
def subhostscount(host):
    """Decrease the counter kept for ``host``.

    Unknown hosts are ignored; a counter that would reach zero or below is
    removed instead of stored.
    """
    if host not in hostscount:
        return

    remaining = hostscount[host] - 1
    if remaining <= 0:
        del hostscount[host]
    else:
        hostscount[host] = remaining
453
def addhostscache(uri, host=None, port=70, selector="/"):
    """Queue ``uri`` for crawling unless a filter rejects it.

    A URI is dropped when its port is not numeric, when it was already
    visited, when it has no host, when the blocklist matches it, or when
    the host's robots rules forbid its selector.  Host, port and selector
    come from the arguments, or from ``uri`` when ``host`` is not given.
    """
    if uri is not None and host is None:
        (host, port, mtype, selector) = parseuri(uri)
        port = int(port)
    else:
        try:
            port = int(port)
        except ValueError:
            return

    if uri in knownuris or host == "":
        print("ignored for queue: %s" % (uri))
        return
    if isblocked(uri):
        print("blocked by filters: %s" % (uri))
        return

    addhostscount(host)

    # Make sure the host has a queue before the robots lookup runs.
    queue = hostscache.setdefault(host, {}).setdefault("queue", {})

    filterrules = cacherobots(cachedir, uri,
                              host=host,
                              port=port,
                              filtercache=robotscache)
    if selectorisallowed(filterrules, selector):
        queue[uri] = None
        print("pushed to queue: %s" % (uri))
    else:
        print("blocked by robots: %s" % (uri))
491
def getqueuelen():
    """Return the total number of queued URIs over all hosts."""
    return sum(len(hostscache[host]["queue"]) for host in hostscache)
497
498 hostscache = loadlistdb("hostscache.pickle")
499 if hostscache == []:
500 hostscache = {}
501 hostscount = loadlistdb("hostscount.pickle")
502 if hostscount == [] or forcehostscount == True:
503 hostscount = {}
504 for host in list(hostscache.keys()):
505 print("host = %s, queuelen = %d" \
506 % (host, \
507 len(hostscache[host]["queue"]…
508 if len(hostscache[host]["queue"]) == 0:
509 del hostscache[host]
510 continue
511 for uri in hostscache[host]["queue"]:
512 (host, port, mtype, selector) = parseuri…
513 addhostscount(host)
514
def storestate():
    """Reload the blocklist file, if any, and pickle the crawler state.

    Writes the visited URIs, the per-host queues and the per-host counters
    to knownuris.pickle, hostscache.pickle and hostscount.pickle.
    """
    if blocklistfile is not None:
        # Update the shared list in place.  A plain assignment made
        # "blocklist" a local of this function, so the reloaded rules were
        # never seen by isblocked() and the name could be unbound when no
        # blocklist file was configured.
        blocklist[:] = loadlist(blocklistfile)
        if len(blocklist) > 0:
            print("blocklist: %s" % (blocklist))
    print("################## Storing state to disc.")
    storelistdb("knownuris.pickle", knownuris)
    storelistdb("hostscache.pickle", hostscache)
    storelistdb("hostscount.pickle", hostscount)
    print("################## Storing state to disc done.")
525
526 jobs = []
527 if starturi != None:
528 #print("starturi = %s" % (starturi))
529 if not isblocked(starturi):
530 (starthost, startport, startmtype, startselector…
531 addhostscache(starturi, \
532 selector=startselector, \
533 host=starthost, \
534 port=startport)
535 try:
536 jobs.append([starturi, starthost, int(st…
537 except ValueError:
538 # Please fix your URI.
539 pass
540
541 # Store state keeper.
542 startnow = datetime.now()
543 storedelta = timedelta(seconds=10) # 30 seconds
544
545 lastlenknownhosts = len(hostscache)
546 lastlenuriqueue = getqueuelen()
547 while lastlenuriqueue > 0:
548 if len(jobs) < workernum:
549 for host in list(hostscache.keys()):
550 if len(hostscache[host]["queue"]) == 0:
551 del hostscache[host]
552 if host in hostscount:
553 del hostscount[host]
554
555 selhosts = sorted(hostscount.items(), \
556 key=operator.itemgetter(1))[:wor…
557
558 # Give hosts with many selectors more jobs.
559 hostjobs = {}
560 for selhost in selhosts:
561 # 10 ** x
562 hostjobs[selhost[0]] = \
563 math.floor(math.log10(selhost[1]…
564 if hostjobs[selhost[0]] == 0:
565 hostjobs[selhost[0]] = 1
566 print("Queue Status: %s" % (hostjobs))
567
568 for selhost in selhosts:
569 selhost = selhost[0]
570 seluris = hostscache[selhost]["queue"]
571 while hostjobs[selhost] > 0:
572 if len(seluris) == 0:
573 break
574 jobitem = seluris.popitem()
575 if isblocked(jobitem[0]):
576 continue
577 (host, port, mtype, selector) = …
578 job = [jobitem[0], host, port, s…
579 if job not in jobs:
580 jobs.append([jobitem[0],…
581 hostjobs[selhost] -= 1
582
583 print("Getting %d jobs." % (len(jobs)))
584
585 dataresults = []
586 with Pool(processes=workernum) as pool:
587 dataresults = pool.map(poolgopher, jobs)
588 #data = gopher(host=host, port=port, selector=se…
589 jobs = []
590
591 for dataresult in dataresults:
592 (cururi, host, port, selector, data) = dataresult
593 subhostscount(host)
594 storerawdata(cachedir, cururi, data, host=host, …
595 menudata = parsemenu(data)
596 #print(menudata)
597 for mi in menudata:
598 # Only menus so far.
599 if mi[0] == "1":
600 # Fix menu items with ports in h…
601 if ":" in mi[3]:
602 mi[3] = mi[3].split(":")…
603
604 guri = "gopher://%s:%s/%s%s" % \
605 (mi[3], mi[4], m…
606
607 addhostscache(guri, host=mi[3], \
608 port=mi[4], \
609 selector=mi[2])
610
611 print("Uri %s done." % (cururi))
612 knownuris[cururi] = None
613
614 lenuriqueue = getqueuelen()
615 lenknownuris = len(knownuris)
616 lenknownhosts = len(hostscache)
617 print("> queue hosts = %d (%d) %s" % \
618 (lenknownhosts, lenknownhosts -
619 lastlenknownhosts, hostscache.ke…
620 print("> uri queue len = %d (%d)" % \
621 (lenuriqueue, lenuriqueue - lastlenuriqu…
622 print("> visited uris = %d (%d)" % \
623 (lenknownuris, lenknownuris - lastlenkno…
624 lastlenknownuris = lenknownuris
625 lastlenuriqueue = lenuriqueue
626 lastlenknownhosts = lenknownhosts
627
628 # TODO: Remove after debugging
629 nowdelta = datetime.now() - startnow
630 if nowdelta >= storedelta:
631 storestate()
632 startnow = datetime.now()
633
634 time.sleep(0.2) # don't be too harsh on servers
635
636 #break #oneshot
637
638 # Save at end of even single shot.
639 storestate()
640
641 return 0
642
643 if __name__ == "__main__":
644 sys.exit(main(sys.argv))
645
You are viewing proxied material from bitreich.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.