Optimize savehostscache. - eomyidae - a gopher crawler software | |
git clone git://bitreich.org/eomyidae | |
Log | |
Files | |
Refs | |
Tags | |
README | |
LICENSE | |
--- | |
commit 29cd7839e600acdd21378256d73b4703f799f04a | |
parent 0dac4a637d7e25983b563286bb0539d53ddf8d3e | |
Author: Christoph Lohmann <[email protected]> | |
Date: Mon, 12 Aug 2019 11:48:12 +0200 | |
Optimize savehostscache. | |
Diffstat: | |
M eomyidae | 49 ++++++++++++++++++++---------… | |
1 file changed, 32 insertions(+), 17 deletions(-) | |
--- | |
diff --git a/eomyidae b/eomyidae | |
@@ -429,6 +429,8 @@ def main(args): | |
starturi = largs[0] | |
knownuris = loadlistdb("knownuris.pickle") | |
+ if knownuris == []: | |
+ knownuris = {} | |
lastlenknownuris = len(knownuris) | |
def isblocked(uri): | |
@@ -449,38 +451,43 @@ def main(args): | |
if hostscount[host] <= 0: | |
del hostscount[host] | |
- def addhostscache(host, uri, port=70): | |
+ def addhostscache(uri, host=None, port=70, selector="/"): | |
+ if uri != None and host == None: | |
+ (host, port, mtype, selector) = parseuri(uri) | |
+ port = int(port) | |
+ else: | |
+ try: | |
+ port = int(port) | |
+ except ValueError: | |
+ return | |
+ | |
if uri in knownuris: | |
- #print("ignored for queue: %s" % (uri)) | |
+ print("ignored for queue: %s" % (uri)) | |
return | |
if host == "": | |
- #print("ignored for queue: %s" % (uri)) | |
+ print("ignored for queue: %s" % (uri)) | |
return | |
if isblocked(uri): | |
print("blocked by filters: %s" % (uri)) | |
return | |
- try: | |
- port = int(port) | |
- except ValueError: | |
- return | |
- | |
addhostscount(host) | |
+ if not host in hostscache: | |
+ hostscache[host] = {} | |
+ if not "queue" in hostscache[host]: | |
+ hostscache[host]["queue"] = {} | |
+ | |
filterrules = cacherobots(cachedir, uri, \ | |
host=host, \ | |
port=port, \ | |
filtercache=robotscache) | |
if selectorisallowed(filterrules, selector) == True: | |
- if not host in hostscache: | |
- hostscache[host] = {} | |
- if not "queue" in hostscache[host]: | |
- hostscache[host]["queue"] = {} | |
hostscache[host]["queue"][uri] = None | |
- #print("pushed to queue: %s" % (uri)) | |
+ print("pushed to queue: %s" % (uri)) | |
else: | |
pass | |
- #print("blocked by robots: %s" % (uri)) | |
+ print("blocked by robots: %s" % (uri)) | |
def getqueuelen(): | |
queuelen = 0 | |
@@ -518,9 +525,13 @@ def main(args): | |
jobs = [] | |
if starturi != None: | |
+ #print("starturi = %s" % (starturi)) | |
if not isblocked(starturi): | |
(starthost, startport, startmtype, startselector) = pa… | |
- addhostscache(hostscache, starthost, starturi) | |
+ addhostscache(starturi, \ | |
+ selector=startselector, \ | |
+ host=starthost, \ | |
+ port=startport) | |
try: | |
jobs.append([starturi, starthost, int(startpor… | |
except ValueError: | |
@@ -564,7 +575,9 @@ def main(args): | |
if isblocked(jobitem[0]): | |
continue | |
(host, port, mtype, selector) = parseu… | |
- jobs.append([jobitem[0], host, port, s… | |
+ job = [jobitem[0], host, port, selecto… | |
+ if job not in jobs: | |
+ jobs.append([jobitem[0], host,… | |
hostjobs[selhost] -= 1 | |
print("Getting %d jobs." % (len(jobs))) | |
@@ -591,7 +604,9 @@ def main(args): | |
guri = "gopher://%s:%s/%s%s" % \ | |
(mi[3], mi[4], mi[0], … | |
- addhostscache(mi[3], guri, port=mi[4]) | |
+ addhostscache(guri, host=mi[3], \ | |
+ port=mi[4], \ | |
+ selector=mi[2]) | |
print("Uri %s done." % (cururi)) | |
knownuris[cururi] = None |