moving to index_crawler.py, which is much more successful but missing some features
index_crawler.py (new file, 161 lines)
@@ -0,0 +1,161 @@
from sys import argv
import requests
from lxml.html import tostring, document_fromstring
from argparse import ArgumentParser
import random
import subprocess
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import os
import os.path
from queue import SimpleQueue, Empty

# bookkeeping shared between worker tasks; currently just a queue of urls that came back 404
class TaskItems:
    def __init__(self, errors404: SimpleQueue):
        self.errors404 = errors404

# random 30-digit hex string used to name temporary files
def rand_name():
    return '%030x' % random.randrange(16**30)

# placeholder for site-specific cleanup of the downloaded chapter; not implemented yet
def clip_chapter(root):
    pass

# fetch one chapter, clean it up, and convert it to epub; returns the epub
# file name, '' for a skipped 404, or None on failure
def process_chapter(url, taskItems: TaskItems = None):
    assert(url)
    base_name = rand_name()
    html_name = base_name + '.html'
    epub_name = base_name + '.epub'
    try:
        print('downloading {}'.format(url))
        resp = requests.get(url)
        if resp.status_code != 200:
            if resp.status_code == 404:
                print('received 404, skipping {}'.format(url))
                if taskItems:
                    taskItems.errors404.put(url)
                return ''
            print('received error code {}'.format(resp.status_code))
            return None
        root = document_fromstring(resp.text)
        clip_chapter(root)
        with open(html_name, 'wb') as f:
            f.write(tostring(root))
        convert_args = ['ebook-convert', html_name, epub_name, '--no-default-epub-cover']
        print(' '.join(convert_args))
        res = subprocess.run(convert_args, stdout=subprocess.DEVNULL)
        if res.returncode != 0:
            return None
        return epub_name

    #cleanup temp files
    finally:
        try:
            os.remove(html_name)
        except:
            pass

# fetch an index page, download every chapter it links to, and merge them into
# a single epub; returns the epub name, '' if nothing was produced, or None on failure
def process_volume(url, taskItems: TaskItems = None):
    assert(url)
    base_url = '/'.join(url.split('/')[:-1]) + '/'
    print('downloading {}'.format(url))
    resp = requests.get(url)
    if resp.status_code != 200:
        if resp.status_code == 404:
            print('received 404, skipping {}'.format(url))
            if taskItems:
                taskItems.errors404.put(url)
        print('received error code {}'.format(resp.status_code))
        return None
    root = document_fromstring(resp.text)
    anchors = root.xpath('//a')
    chapter_urls = []
    for a in anchors:
        href = a.get('href')
        if href is None:
            continue
        if '/' in href or '#' in href:
            continue
        if not (href.endswith('.htm') or href.endswith('.html')):
            continue
        chapter_urls.append(href)
    chapter_names = []
    try:
        for chapter_url in chapter_urls:
            ch = process_chapter(base_url + chapter_url, taskItems)
            if ch is None:
                return None
            #empty strings mean an intentionally skipped chapter
            if ch == '':
                continue
            chapter_names.append(ch)
        if not chapter_names:
            return ''
        vol_name = rand_name() + '.epub'
        merge_args = ['calibre-debug', '--run-plugin', 'EpubMerge', '--',
                      '-N', '-o', vol_name] + chapter_names
        print(' '.join(merge_args))
        res = subprocess.run(merge_args, stdout=subprocess.DEVNULL)
        if res.returncode:
            return None
        return vol_name
    #cleanup temp files
    finally:
        for ch in chapter_names:
            try:
                os.remove(ch)
            except:
                pass

# download each url (index pages are treated as whole volumes, anything else
# as a single chapter) and merge the results into one output epub
def main(cli_args):
    parser = ArgumentParser()
    parser.add_argument('-o', '--output', help='output file name', type=str)
    parser.add_argument('-t', '--title', help='ebook title', type=str)
    parser.add_argument('-g', '--tag', help='apply a tag', action='append')
    parser.add_argument('url', help='url to download', nargs='+')
    args = parser.parse_args(cli_args)
    urls = args.url
    name = args.output or 'output.epub'
    title = args.title
    tags = args.tag or []
    taskItems = TaskItems(errors404=SimpleQueue())
    documents = []
    docs = []
    with ThreadPoolExecutor() as executor:
        try:
            for item in urls:
                if 'index' in item:
                    documents.append(executor.submit(process_volume, url=item, taskItems=taskItems))
                else:
                    documents.append(executor.submit(process_chapter, url=item, taskItems=taskItems))
            concurrent.futures.wait(documents, timeout=10*60)
            docs = [d.result() for d in documents]
            docs = [d for d in docs if d]

            merge_args = ['calibre-debug', '--run-plugin', 'EpubMerge', '--',
                          '-N', '-o', name]
            if title:
                merge_args += ['-t', title]
            for tag in tags:
                merge_args += ['-g', tag]
            merge_args += docs
            print(' '.join(merge_args))
            res = subprocess.run(merge_args, stdout=subprocess.DEVNULL)
            if res.returncode:
                print('final merge failed')
                return 1
            return 0
        #cleanup temp files
        finally:
            try:
                while True:
                    print('received 404: {}'.format(taskItems.errors404.get(False)))
            except Empty:
                pass
            for item in docs:
                try:
                    os.remove(item)
                except:
                    pass

if __name__ == '__main__':
    exit(main(argv[1:]))
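clip_chapter() above is still a stub, presumably part of what the commit message calls the missing features. A minimal sketch of what it could do, based on the footer-clipping block in the old script's download_chapter() (visible in the hunks below); the 't2h-foot ' comment marker is carried over from that code, not confirmed by this commit:

# sketch only, not part of the commit: mirror the old script's footer removal
def clip_chapter(root):
    for comment in root.xpath('//comment()'):
        if comment.text and 't2h-foot ' in comment.text:
            parent = comment.getparent()
            # drop the footer comment and every sibling that follows it
            while comment.getnext() is not None:
                parent.remove(comment.getnext())
            parent.remove(comment)
            break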
@@ -4,11 +4,19 @@ import requests
 import random
 import subprocess
 import os
+from sys import argv
 from multiprocessing import Pool
 
 def rand_name():
     return '%030x' % random.randrange(16**30)
 
+def all_children(node):
+    for child in node.getchildren():
+        for item in all_children(child):
+            yield item
+        yield child
+    yield node
+
 def download_chapter(url):
     print('downloading {}'.format(url))
     response = requests.get(url)
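The all_children() generator added above is a hand-rolled recursive walk; lxml elements already expose an equivalent depth-first iterator, so a drop-in replacement (a sketch, not part of the commit) could be:

# sketch: Element.iter() yields the element itself plus all of its descendants
def all_children(node):
    yield from node.iter()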
@@ -18,12 +26,15 @@ def download_chapter(url):
     root = document_fromstring(response.text)
 
     #find and remove footer
-    comments = root.xpath('//comment()')
-    foot = next(filter(lambda e: 't2h-foot ' in e.text, comments))
-    parent = foot.getparent()
-    prev = foot.getprevious()
-    while prev.getnext() is not None:
-        parent.remove(prev.getnext())
+    try:
+        comments = root.xpath('//comment()')
+        foot = next(filter(lambda e: 't2h-foot ' in e.text, comments))
+        parent = foot.getparent()
+        prev = foot.getprevious()
+        while prev.getnext() is not None:
+            parent.remove(prev.getnext())
+    except StopIteration:
+        pass
 
     randname = rand_name()
     htmlname = randname + '.html'
@@ -44,20 +55,14 @@ def download_book(url):
     response = requests.get(url)
     if response.status_code != 200:
         print('failed to download page')
+        #this shouldn't exit
         exit(1)
     root = document_fromstring(response.text)
 
     #I think this finds multi-page docs
-    spans = root.xpath('//span')
-    toc = None
-    try:
-        toc = next(filter(lambda e: 'toc' in e.classes, spans))
-    except StopIteration:
-        pass
-    if toc is None:
-        #TODO this will download page a second time, allow passing text directly
-        return download_chapter(url)
-    else:
+    tocs = [node for node in all_children(root) if 'toc' in node.classes]
+    if tocs:
         #TODO make better
         title = [e for e in root.xpath('//h3') if 'title' in e.classes][0].text
 
@@ -81,9 +86,14 @@ def download_book(url):
         for f in res:
             os.remove(f)
         return temp_name
+    else:
+        #TODO this will download page a second time, allow passing text directly
+        return download_chapter(url)
 
 
-def main():
+def main(cli_args):
     parser = ArgumentParser()
     parser.add_argument('-o', '--output', help='name of output file', dest='output')
     #parser.add_argument('-i', '--input', help='input urls', dest='input', action='append')
@@ -94,7 +104,7 @@ def main():
     parser.add_argument('-g', '--tag', help='apply a tag', action='append')
     parser.add_argument('-l', '--language', help='set language', default=None)
     parser.add_argument('url', help='urls to download', nargs='+')
-    args = parser.parse_args()
+    args = parser.parse_args(cli_args)
     #inp = args.input
     urls = args.url
     outp = args.output or 'output.epub'
@@ -105,6 +115,12 @@ def main():
     tags = args.tag
     lanugage = args.language
 
+    for i in range(len(urls)-1):
+        if urls[i] in urls[i+1:]:
+            print('duplicate url:')
+            print(urls[i])
+            return 1
+
     output_extension = os.path.split(outp)[1]
 
     books = [download_book(i) for i in urls]
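The duplicate-URL guard added above rescans the tail of the list for every element; a set-based pass inside main() (again just a sketch, not part of the commit) does the same check in one pass and still reports the offending URL:

# sketch: linear-time duplicate check
seen = set()
for u in urls:
    if u in seen:
        print('duplicate url:')
        print(u)
        return 1
    seen.add(u)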
@@ -143,6 +159,7 @@ def main():
         raise Exception('final conversion returned {}'.format(sub.returncode))
     os.remove(temp_name)
     print('created {}'.format(outp))
+    return 0
 
 if __name__ == '__main__':
-    main()
+    exit(main(argv))