From c696c2501c9fb951d37bd9f51b39066dd382b58b Mon Sep 17 00:00:00 2001
From: SexbearLmao
Date: Mon, 29 Mar 2021 23:56:46 -0500
Subject: [PATCH] moving to index_crawler.py, which is much more successful but missing some features

---
 index_crawler.py | 161 +++++++++++++++++++++++++++++++++++++++++++++++
 marxistbook.py   |  55 ++++++++++------
 2 files changed, 197 insertions(+), 19 deletions(-)
 create mode 100644 index_crawler.py

diff --git a/index_crawler.py b/index_crawler.py
new file mode 100644
index 0000000..ecbd61d
--- /dev/null
+++ b/index_crawler.py
@@ -0,0 +1,161 @@
+from sys import argv
+import requests
+from lxml.html import tostring, document_fromstring
+from argparse import ArgumentParser
+import random
+import subprocess
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+import os
+import os.path
+from queue import SimpleQueue, Empty
+
+class TaskItems:
+    def __init__(self, errors404: SimpleQueue):
+        self.errors404 = errors404
+
+def rand_name():
+    return '%030x' % random.randrange(16**30)
+
+def clip_chapter(root):
+    pass
+
+def process_chapter(url, taskItems: TaskItems =None):
+    assert(url)
+    base_name = rand_name()
+    html_name = base_name + '.html'
+    epub_name = base_name + '.epub'
+    try:
+        print('downloading {}'.format(url))
+        resp = requests.get(url)
+        if resp.status_code != 200:
+            if resp.status_code == 404:
+                print('received 404, skipping {}'.format(url))
+                if taskItems:
+                    taskItems.errors404.put(url)
+                return ''
+            print('received error code {}'.format(resp.status_code))
+            return None
+        root = document_fromstring(resp.text)
+        clip_chapter(root)
+        with open(html_name, 'wb') as f:
+            f.write(tostring(root))
+        convert_args = ['ebook-convert', html_name, epub_name, '--no-default-epub-cover']
+        print(' '.join(convert_args))
+        res = subprocess.run(convert_args, stdout=subprocess.DEVNULL)
+        if res.returncode != 0:
+            return None
+        return epub_name
+
+    #cleanup temp files
+    finally:
+        try:
+            os.remove(html_name)
+        except:
+            pass
+
+def process_volume(url, taskItems: TaskItems =None):
+    assert(url)
+    base_url = '/'.join(url.split('/')[:-1]) + '/'
+    print('downloading {}'.format(url))
+    resp = requests.get(url)
+    if resp.status_code != 200:
+        if resp.status_code == 404:
+            print('received 404, skipping {}'.format(url))
+            if taskItems:
+                taskItems.errors404.put(url)
+        print('received error code {}'.format(resp.status_code))
+        return None
+    root = document_fromstring(resp.text)
+    anchors = root.xpath('//a')
+    chapter_urls = []
+    for a in anchors:
+        href = a.get('href')
+        if href is None:
+            continue
+        if '/' in href or '#' in href:
+            continue
+        if not (href.endswith('.htm') or href.endswith('.html')):
+            continue
+        chapter_urls.append(href)
+    chapter_names = []
+    try:
+        for chapter_url in chapter_urls:
+            ch = process_chapter(base_url + chapter_url, taskItems)
+            if ch is None:
+                return None
+            #empty strings mean an intentionally skipped chapter
+            if ch == '':
+                continue
+            chapter_names.append(ch)
+        if not chapter_names:
+            return ''
+        vol_name = rand_name() + '.epub'
+        merge_args = ['calibre-debug', '--run-plugin', 'EpubMerge', '--',
+                '-N', '-o', vol_name] + chapter_names
+        print(' '.join(merge_args))
+        res = subprocess.run(merge_args, stdout=subprocess.DEVNULL)
+        if res.returncode:
+            return None
+        return vol_name
+    #cleanup temp files
+    finally:
+        for ch in chapter_names:
+            try:
+                os.remove(ch)
+            except:
+                pass
+
+def main(cli_args):
+    parser = ArgumentParser()
+    parser.add_argument('-o', '--output', help='output file name', type=str)
+    parser.add_argument('-t', '--title', help='ebook title', type=str)
+    parser.add_argument('-g', '--tag', help='apply a tag', action='append')
+    parser.add_argument('url', help='url to download', nargs='+')
+    #args = parser.parse_args(cli_args)
+    args = parser.parse_args()
+    urls = args.url
+    name = args.output or 'output.epub'
+    title = args.title
+    tags = args.tag
+    taskItems = TaskItems(errors404=SimpleQueue())
+    documents = []
+    with ThreadPoolExecutor() as executor:
+        try:
+            for item in urls:
+                if 'index' in item:
+                    documents.append(executor.submit(process_volume, url=item, taskItems=taskItems))
+                else:
+                    documents.append(executor.submit(process_chapter, url=item, taskItems=taskItems))
+            concurrent.futures.wait(documents, timeout=10*60)
+            docs = [d.result() for d in documents]
+            docs = [d for d in docs if d]
+
+            merge_args = ['calibre-debug', '--run-plugin', 'EpubMerge', '--',
+                    '-N', '-o', name]
+            if title:
+                merge_args += ['-t', title]
+            for tag in tags:
+                merge_args += ['-g', tag]
+            merge_args += docs
+            print(' '.join(merge_args))
+            res = subprocess.run(merge_args, stdout=subprocess.DEVNULL)
+            if res.returncode:
+                print('final merge failed')
+                return 1
+            return 0
+        #cleanup temp files
+        finally:
+            try:
+                while True:
+                    print('received 404: {}'.format(taskItems.errors404.get(False)))
+            except Empty:
+                pass
+            for item in docs:
+                try:
+                    os.remove(item)
+                except:
+                    pass
+
+if __name__ == '__main__':
+    exit(main(argv[1:]))
diff --git a/marxistbook.py b/marxistbook.py
index 6105f8c..cc767d4 100644
--- a/marxistbook.py
+++ b/marxistbook.py
@@ -4,11 +4,19 @@ import requests
 import random
 import subprocess
 import os
+from sys import argv
 from multiprocessing import Pool
 
 def rand_name():
     return '%030x' % random.randrange(16**30)
 
+def all_children(node):
+    for child in node.getchildren():
+        for item in all_children(child):
+            yield item
+        yield child
+    yield node
+
 def download_chapter(url):
     print('downloading {}'.format(url))
     response = requests.get(url)
@@ -18,12 +26,15 @@
     root = document_fromstring(response.text)
 
     #find and remove footer
-    comments = root.xpath('//comment()')
-    foot = next(filter(lambda e: 't2h-foot ' in e.text, comments))
-    parent = foot.getparent()
-    prev = foot.getprevious()
-    while prev.getnext() is not None:
-        parent.remove(prev.getnext())
+    try:
+        comments = root.xpath('//comment()')
+        foot = next(filter(lambda e: 't2h-foot ' in e.text, comments))
+        parent = foot.getparent()
+        prev = foot.getprevious()
+        while prev.getnext() is not None:
+            parent.remove(prev.getnext())
+    except StopIteration:
+        pass
 
     randname = rand_name()
     htmlname = randname + '.html'
@@ -44,20 +55,14 @@
 def download_book(url):
     response = requests.get(url)
     if response.status_code != 200:
         print('failed to download page')
+        #this shouldn't exit
         exit(1)
     root = document_fromstring(response.text)
 
     #I think this finds multi-page docs
-    spans = root.xpath('//span')
-    toc = None
-    try:
-        toc = next(filter(lambda e: 'toc' in e.classes, spans))
-    except StopIteration:
-        pass
-    if toc is None:
-        #TODO this will download page a second time, allow passing text directly
-        return download_chapter(url)
-    else:
+    tocs = [node for node in all_children(root) if 'toc' in node.classes]
+
+    if tocs:
         #TODO make better
         title = [e for e in root.xpath('//h3') if 'title' in e.classes][0].text
@@ -81,9 +86,14 @@
         for f in res:
             os.remove(f)
         return temp_name
+    else:
+        #TODO this will download page a second time, allow passing text directly
+        return download_chapter(url)
+    else:
+
 
 
-def main():
+def main(cli_args):
     parser = ArgumentParser()
     parser.add_argument('-o', '--output', help='name of output file', dest='output')
     #parser.add_argument('-i', '--input', help='input urls', dest='input', action='append')
@@ -94,7 +104,7 @@
     parser.add_argument('-g', '--tag', help='apply a tag', action='append')
     parser.add_argument('-l', '--language', help='set language', default=None)
     parser.add_argument('url', help='urls to download', nargs='+')
-    args = parser.parse_args()
+    args = parser.parse_args(cli_args)
     #inp = args.input
     urls = args.url
     outp = args.output or 'output.epub'
@@ -105,6 +115,12 @@
     tags = args.tag
     lanugage = args.language
 
+    for i in range(len(urls)-1):
+        if urls[i] in urls[i+1:]:
+            print('duplicate url:')
+            print(urls[i])
+            return 1
+
     output_extension = os.path.split(outp)[1]
 
     books = [download_book(i) for i in urls]
@@ -143,6 +159,7 @@
         raise Exception('final conversion returned {}'.format(sub.returncode))
     os.remove(temp_name)
     print('created {}'.format(outp))
+    return 0
 
 if __name__ == '__main__':
-    main()
+    exit(main(argv))