More efficient Python script

- Fixed pipelining
- Cleaning everything up
- Don't re-download saved characters
- Add the media to .gitignore

About pipelining:

According to Python:
1) you send a request
2) you MUST get response headers for (1) (THIS IS MANDATORY)
3) you send another request
4) you get response body for (2)
5) response headers for (3)
6) response body for (5)

Only two requests can be pipelined. Surely this is an unavoidable limitation — wait, no, it's just written into the code to error out if you don't do it that way.

According to reality:
1) you send a request
2) you do not get response headers for (1)
3) you repeat steps 1-2 until enough responses are queued
4) you receive those responses as header,body,header,body...

They even name it with a __ prefix so as to make it hard to override, but the state can safely go to Idle after a request has been sent, whether or not response headers have come in. Sure, the connection might close, but then you adjust to not pipeline, and re-send the rest of your requests over a new connection.
This commit is contained in:
user 2015-09-21 19:11:25 +00:00 committed by SmallJoker
parent e762283dec
commit f389e6bd13
2 changed files with 168 additions and 47 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
character_*.png
character_*.txt

View File

@ -1,61 +1,180 @@
#!/usr/bin/python3 #!/usr/bin/python3
from http.client import HTTPConnection from http.client import HTTPConnection,HTTPException,BadStatusLine,_CS_IDLE
import json import json
import base64 import base64
from contextlib import closing
import sys,os,shutil,time
def die(message,code=23):
print(message,file=sys.stderr)
raise SystemExit(code)
server = "minetest.fensta.bplaced.net" server = "minetest.fensta.bplaced.net"
skinsdir = "u_skins/textures/" skinsdir = "u_skins/textures/"
metadir = "u_skins/meta/" metadir = "u_skins/meta/"
i = 1 curskin = 0
pages = 1 curpage = 1
pages = None
c = HTTPConnection(server) def replace(location,base,encoding=None,path=None):
def addpage(page): if path is None:
global i, pages path = os.path.join(location,base)
print("Page: " + str(page)) mode = "wt" if encoding else "wb"
r = 0 # an unpredictable temp name only needed for a+rwxt directories
try: tmp = os.path.join(location,'.'+base+'-tmp')
c.request("GET", "/api/get.json.php?getlist&page=" + str(page) + "&outformat=base64") def deco(handle):
r = c.getresponse() with open(tmp,mode,encoding=encoding) as out:
except Exception: handle(out)
if r != 0: os.rename(tmp,path)
if r.status != 200: return deco
print("Error", r.status)
exit(r.status) def maybeReplace(location,base,encoding=None):
return def deco(handle):
path = os.path.join(location,base)
data = r.read().decode() if os.path.exists(path): return
l = json.loads(data) return replace(location,base,encoding=encoding,path=path)(handle)
if not l["success"]: return deco
print("Success != True")
exit(1) class Penguin:
r = 0 "idk"
pages = int(l["pages"]) def __init__(self, url, recv, diemessage):
for s in l["skins"]: self.url = url
f = open(skinsdir + "character_" + str(i) + ".png", "wb") self.recv = recv
f.write(base64.b64decode(bytes(s["img"], 'utf-8'))) self.diemessage = diemessage
f.close()
f = open(metadir + "character_" + str(i) + ".txt", "w") class Pipeline(list):
f.write(str(s["name"]) + '\n') "Gawd why am I being so elaborate?"
f.write(str(s["author"]) + '\n') def __init__(self, threshold=10):
f.write(str(s["license"])) "threshold is how many requests in parallel to pipeline"
f.close() self.threshold = threshold
self.sent = True
def __enter__(self):
self.reopen()
return self
def __exit__(self,typ,exn,trace):
self.send()
self.drain()
def reopen(self):
self.c = HTTPConnection(server)
self.send()
def append(self,url,recv,diemessage):
self.sent = False
super().append(Penguin(url,recv,diemessage))
if len(self) > self.threshold:
self.send()
self.drain()
def trydrain(self):
for penguin in self:
print('drain',penguin.url)
try:
penguin.response.begin()
penguin.recv(penguin.response)
except BadStatusLine as e:
print('derped requesting',penguin.url)
return False
except HTTPException as e:
die(penguin.diemessage+' '+repr(e)+' (url='+penguin.url+')')
self.clear()
return True
def drain(self):
print('draining pipeline...',len(self))
assert self.sent, "Can't drain without sending the requests!"
self.sent = False
while self.trydrain() is not True:
self.c.close()
print('drain failed, trying again')
time.sleep(1)
self.reopen()
def trysend(self):
for penguin in pipeline:
print('fill',penguin.url)
try:
self.c.request("GET", penguin.url)
self.c._HTTPConnection__state = _CS_IDLE
penguin.response = self.c.response_class(self.c.sock,
method="GET")
# begin LATER so we can send multiple requests w/out response headers
except BadStatusLine:
return False
except HTTPException as e:
die(diemessage+' because of a '+repr(e))
return True
def send(self):
if self.sent: return
print('filling pipeline...',len(self))
while self.trysend() is not True:
self.c.close()
print('derped resending')
time.sleep(1)
self.reopen()
self.sent = True
with Pipeline() as pipeline:
# two connections is okay, right? one for json, one for preview images
c = HTTPConnection(server)
def addpage(page):
global curskin, pages
print("Page: " + str(page))
r = 0
try: try:
c.request("GET", "/skins/1/" + str(s["id"]) + ".png") c.request("GET", "/api/get.json.php?getlist&page=" + str(page) + "&outformat=base64")
r = c.getresponse() r = c.getresponse()
except Exception: except Exception:
if r != 0: if r != 0:
if r.status != 200: if r.status != 200:
print("Error", r.status) die("Error", r.status)
continue return
data = r.read() data = r.read().decode()
f = open(skinsdir + "character_" + str(i) + "_preview.png", "wb") l = json.loads(data)
f.write(data) if not l["success"]:
f.close() die("Success != True")
i = i + 1 r = 0
addpage(1) pages = int(l["pages"])
if pages > 1: foundOne = False
for p in range(pages-1): for s in l["skins"]:
addpage(p+2) # make sure to increment this, even if the preview exists!
print("Skins have been updated!") curskin = curskin + 1
previewbase = "character_" + str(curskin) + "_preview.png"
preview = os.path.join(skinsdir, previewbase)
if os.path.exists(preview):
print('skin',curskin,'already retrieved')
continue
print('updating skin',curskin,'id',s["id"])
foundOne = True
@maybeReplace(skinsdir, "character_" + str(curskin) + ".png")
def go(f):
f.write(base64.b64decode(bytes(s["img"], 'utf-8')))
f.close()
@maybeReplace(metadir, "character_" + str(curskin) + ".txt",
encoding='utf-8')
def go(f):
f.write(str(s["name"]) + '\n')
f.write(str(s["author"]) + '\n')
f.write(str(s["license"]))
url = "/skins/1/" + str(s["id"]) + ".png"
def closure(skinsdir,previewbase,preview,s):
"explanation: python sucks"
def tryget(r):
print('replacing',s["id"])
if r.status != 200:
print("Error", r.status)
return
@replace(skinsdir,previewbase,path=preview)
def go(f):
shutil.copyfileobj(r,f)
return tryget
pipeline.append(url,closure(skinsdir,previewbase,preview,s),
"Couldn't get {} because of a".format(
s["id"]))
if not foundOne:
print("No skins updated on this page. Seems we're done?")
#raise SystemExit
addpage(curpage)
while pages > curpage:
curpage = curpage + 1
addpage(curpage)
print("Skins have been updated!")