# MarxistEbook/index_crawler.py
import sys
import requests
from lxml.html import tostring, document_fromstring
from argparse import ArgumentParser
import random
import subprocess
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import os
from queue import SimpleQueue, Empty
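
# Downloads chapter pages (or whole index pages of chapter links), converts
# each one to EPUB with Calibre's ebook-convert, and merges the results using
# Calibre's EpubMerge plugin. Requires the third-party requests and lxml
# packages, plus the ebook-convert and calibre-debug executables on PATH.
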
class TaskItems:
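    """Shared per-run state for worker tasks: a queue that collects URLs
    returning 404, plus the option flags."""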
    def __init__(self, errors404: SimpleQueue, store_images: bool = False,
                 trim: bool = True):
        self.errors404 = errors404
        self.store_images = store_images
        self.trim = trim
def rand_name():
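    """Return a random 30-hex-digit string, used to name temp files."""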
return '%030x' % random.randrange(16**30)
def trim_chapter(root):
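    """Strip trailing footer markup from a parsed chapter: everything from
    the 't2h-foot ' marker comment onward, and any <p> carrying a 'footer'
    class."""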
    # find by comment: drop the 't2h-foot ' marker comment and everything
    # after it
    try:
        comments = root.xpath('//comment()')
        foot = next(filter(lambda e: 't2h-foot ' in e.text, comments))
        parent = foot.getparent()
        while foot.getnext() is not None:
            parent.remove(foot.getnext())
        parent.remove(foot)
    except StopIteration:
        pass
    # find by class: drop any paragraph carrying a 'footer' class
    for foot in root.xpath('//p'):
        if 'footer' in foot.classes:
            foot.getparent().remove(foot)
def process_chapter(url, taskItems: TaskItems = None):
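    """Download one chapter, optionally trim its footer, and convert it to
    EPUB. Returns the EPUB file name, '' for an intentionally skipped (404)
    chapter, or None on failure."""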
    assert url
base_name = rand_name()
html_name = base_name + '.html'
epub_name = base_name + '.epub'
try:
print('downloading {}'.format(url))
resp = requests.get(url)
if resp.status_code != 200:
if resp.status_code == 404:
print('received 404, skipping {}'.format(url))
if taskItems:
taskItems.errors404.put(url)
return ''
print('received error code {}'.format(resp.status_code))
return None
root = document_fromstring(resp.text)
if taskItems and taskItems.trim:
trim_chapter(root)
with open(html_name, 'wb') as f:
f.write(tostring(root))
convert_args = ['ebook-convert', html_name, epub_name, '--no-default-epub-cover']
print(' '.join(convert_args))
res = subprocess.run(convert_args, stdout=subprocess.DEVNULL)
if res.returncode != 0:
return None
return epub_name
    # cleanup temp files
    finally:
        try:
            os.remove(html_name)
        except OSError:
            pass
def process_volume(url, taskItems: TaskItems = None):
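    """Download an index page, convert each same-directory chapter link to
    EPUB, and merge the chapters into one volume. Returns the volume file
    name, '' if every chapter was skipped, or None on failure."""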
    assert url
base_url = '/'.join(url.split('/')[:-1]) + '/'
print('downloading {}'.format(url))
resp = requests.get(url)
    if resp.status_code != 200:
        if resp.status_code == 404:
            print('received 404, skipping {}'.format(url))
            if taskItems:
                taskItems.errors404.put(url)
            return ''
        print('received error code {}'.format(resp.status_code))
        return None
root = document_fromstring(resp.text)
anchors = root.xpath('//a')
chapter_urls = []
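    # keep only bare relative links to .htm/.html files in the same directory;
    # hrefs containing '/' or '#' point elsewhere and are skipped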
for a in anchors:
href = a.get('href')
if href is None:
continue
if '/' in href or '#' in href:
continue
if not (href.endswith('.htm') or href.endswith('.html')):
continue
chapter_urls.append(href)
chapter_names = []
try:
for chapter_url in chapter_urls:
ch = process_chapter(base_url + chapter_url, taskItems)
if ch is None:
return None
#empty strings mean an intentionally skipped chapter
if ch == '':
continue
chapter_names.append(ch)
if not chapter_names:
return ''
vol_name = rand_name() + '.epub'
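        # merge the chapter EPUBs into one volume with Calibre's EpubMerge plugin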
merge_args = ['calibre-debug', '--run-plugin', 'EpubMerge', '--',
'-N', '-o', vol_name] + chapter_names
print(' '.join(merge_args))
res = subprocess.run(merge_args, stdout=subprocess.DEVNULL)
if res.returncode:
return None
return vol_name
    # cleanup temp chapter files
    finally:
        for ch in chapter_names:
            try:
                os.remove(ch)
            except OSError:
                pass
def main(cli_args):
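    """Parse arguments, fetch every requested URL concurrently, and merge
    the resulting EPUBs into the final output file."""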
parser = ArgumentParser()
parser.add_argument('-o', '--output', help='output file name', type=str)
parser.add_argument('-t', '--title', help='ebook title', type=str)
parser.add_argument('-a', '--author', help="DOESN'T WORK ebook author", type=str)
parser.add_argument('-g', '--tag', help='apply a tag', action='append')
parser.add_argument('-I', '--images', help='NOT IMPLEMENTED also attempt to download any images', action='store_true')
parser.add_argument('-C', '--auto-cover', help='NOT IMPLEMENTED generate an automatic cover', action='store_true', dest='cover')
parser.add_argument('-T', '--no-trim', help="don't try to trim footer at bottom of chapters", action='store_false', dest='trim')
parser.add_argument('url', help='url to download', nargs='+')
    args = parser.parse_args(cli_args)
urls = args.url
name = args.output or 'output.epub'
title = args.title
author = args.author
tags = args.tag
store_images = args.images
cover = args.cover
trim = args.trim
taskItems = TaskItems(errors404=SimpleQueue(), store_images=store_images,
trim=trim)
    documents = []
    # docs is also referenced in the cleanup (finally) block below, so define
    # it before any early return can occur
    docs = []
with ThreadPoolExecutor() as executor:
try:
for item in urls:
if 'index' in item:
documents.append(executor.submit(process_volume, url=item, taskItems=taskItems))
else:
documents.append(executor.submit(process_chapter, url=item, taskItems=taskItems))
            # concurrent.futures.wait never raises TimeoutError; it returns
            # the set of futures still pending when the timeout expires
            _, not_done = concurrent.futures.wait(documents, timeout=10*60)
            if not_done:
                print('Timeout while waiting for tasks')
                return 1
            docs = [d.result() for d in documents]
            # drop failures (None) and intentionally skipped documents ('')
            docs = [d for d in docs if d]
            # merge into a temp file when a conversion pass is still needed
            # (cover generation, or an output name that isn't '.epub')
            if cover or not name.endswith('.epub'):
                merge_name = rand_name() + '.epub'
            else:
                merge_name = name
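            # final merge of all downloaded volumes/chapters into one book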
merge_args = ['calibre-debug', '--run-plugin', 'EpubMerge', '--',
'-N', '-o', merge_name]
if title:
merge_args += ['-t', title]
if author:
merge_args += ['-a', author]
            # args.tag defaults to None when no -g flag was given
            for tag in tags or []:
                merge_args += ['-g', tag]
merge_args += docs
print(' '.join(merge_args))
res = subprocess.run(merge_args, stdout=subprocess.DEVNULL)
if res.returncode:
print('final merge failed')
return 1
if name != merge_name:
#TODO to generate cover, run an epub conversion here
try:
convert_args = ['ebook-convert', merge_name, name]
print(' '.join(convert_args))
res = subprocess.run(convert_args, stdout=subprocess.DEVNULL)
if res.returncode:
print('final conversion failed')
return res.returncode
                finally:
                    try:
                        os.remove(merge_name)
                    except OSError:
                        pass
return 0
        # report collected 404s and cleanup temp files
        finally:
            try:
                while True:
                    print('received 404: {}'.format(taskItems.errors404.get(False)))
            except Empty:
                pass
            for item in docs:
                try:
                    os.remove(item)
                except OSError:
                    pass

if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))