first commit

2021-03-28 16:33:26 -05:00
commit 7395fa3ae4
2 changed files with 137 additions and 0 deletions
--- a/marxistbook.py
+++ b/marxistbook.py
@@ -0,0 +1,131 @@
+from lxml.html import tostring, document_fromstring
+from argparse import ArgumentParser
+import requests
+import random
+import subprocess
+import os
+from multiprocessing import Pool
+
+def rand_name():
+    """Return a random string of 30 lowercase hexadecimal digits.
+
+    The value is zero-padded, so the result always has the same length.
+    Callers in this file append an extension to it to build file names.
+    """
+    digits = 30
+    value = random.randrange(16 ** digits)
+    return format(value, '0{}x'.format(digits))
+
+def download_chapter(url):
+    """Download one HTML page and convert it to an EPUB file.
+
+    The page is fetched and parsed, the ``t2h-foot`` marker comment and every
+    sibling that follows it are removed, and the remaining HTML is written to
+    a temporary file that is handed to the ``ebook-convert`` executable.
+
+    Args:
+        url: Address of the page to download.
+
+    Returns:
+        The name of the generated EPUB, a randomly named file relative to the
+        current working directory.  Deleting it is left to the caller.
+
+    Raises:
+        RuntimeError: If the response status is not 200 or ``ebook-convert``
+            exits with a non-zero return code.
+    """
+    print('downloading {}'.format(url))
+    response = requests.get(url)
+    if response.status_code != 200:
+        raise RuntimeError('failed to download page')
+    root = document_fromstring(response.text)
+
+    # Locate the footer marker.  A page without one is converted unchanged
+    # instead of aborting with a bare StopIteration.
+    comments = root.xpath('//comment()')
+    foot = next((c for c in comments if c.text and 't2h-foot ' in c.text), None)
+    parent = foot.getparent() if foot is not None else None
+    if parent is not None:
+        # Remove the marker and everything after it.  Walking forward from the
+        # marker itself (not from its previous sibling) also works when the
+        # marker is the first child of its parent.
+        while foot.getnext() is not None:
+            parent.remove(foot.getnext())
+        parent.remove(foot)
+
+    randname = rand_name()
+    htmlname = randname + '.html'
+    epubname = randname + '.epub'
+    with open(htmlname, 'wb') as f:
+        f.write(tostring(root))
+
+    convert_args = ['ebook-convert', htmlname, epubname, '--no-default-epub-cover']
+    print(' '.join(convert_args))
+    try:
+        comp = subprocess.run(convert_args, stdout=subprocess.DEVNULL)
+    finally:
+        # The intermediate HTML is only input for the converter; delete it
+        # whether or not the conversion worked so failures leave no litter.
+        os.remove(htmlname)
+    if comp.returncode:
+        raise RuntimeError('subprocess returned error code {}'.format(comp.returncode))
+    return epubname
+
+def download_book(url):
+    """Download a work and return the name of an EPUB file containing it.
+
+    The page at ``url`` is fetched.  If it contains ``<span class="toc">``
+    entries it is treated as an index page: every linked chapter is downloaded
+    with ``download_chapter`` in a process pool and the chapter EPUBs are
+    merged with the calibre ``EpubMerge`` plugin.  Otherwise the page itself
+    is converted as a single chapter.
+
+    Args:
+        url: Address of the index page or of a single-page document.
+
+    Returns:
+        The name of the resulting EPUB, a randomly named file relative to the
+        current working directory.  Deleting it is left to the caller.
+
+    Raises:
+        SystemExit: With code 1 if the page does not answer with status 200.
+        RuntimeError: If the index has no usable chapter links or the merge
+            command exits with a non-zero return code.
+    """
+    # Local import: keeps this function self-contained without touching the
+    # file-level import block.
+    from urllib.parse import urljoin
+
+    response = requests.get(url)
+    if response.status_code != 200:
+        print('failed to download page')
+        # The bare ``exit`` helper is installed by the ``site`` module for
+        # interactive sessions and is not guaranteed to exist; raising
+        # SystemExit has the same effect everywhere.
+        raise SystemExit(1)
+    root = document_fromstring(response.text)
+
+    # I think this finds multi-page docs
+    toc_spans = [s for s in root.xpath('//span') if 'toc' in s.classes]
+    if not toc_spans:
+        #TODO this will download page a second time, allow passing text directly
+        return download_chapter(url)
+
+    # Use the full text of the first <h3 class="title">, with whitespace
+    # collapsed; ``.text`` alone is None when the heading starts with a child
+    # element.  A missing heading simply leaves the merged book untitled.
+    title_nodes = [e for e in root.xpath('//h3') if 'title' in e.classes]
+    title = ' '.join(title_nodes[0].text_content().split()) if title_nodes else ''
+
+    # Resolve each entry against the index URL so relative, parent-relative
+    # and absolute hrefs all work; entries without a link are skipped.
+    links = []
+    for span in toc_spans:
+        anchors = span.xpath('a')
+        href = anchors[0].get('href') if anchors else None
+        if href:
+            links.append(urljoin(url, href))
+    if not links:
+        raise RuntimeError('table of contents contains no chapter links')
+
+    with Pool() as pool:
+        res = pool.map(download_chapter, links)
+
+    temp_name = rand_name() + '.epub'
+    merge_args = ['calibre-debug', '--run-plugin', 'EpubMerge', '--',
+        '-N', '-o', temp_name]
+    if title:
+        merge_args += ['-t', title]
+    merge_args += res
+    print(' '.join(merge_args))
+    try:
+        sub = subprocess.run(merge_args, stdout=subprocess.DEVNULL)
+    finally:
+        # cleanup temporary chapter files, even when the merge failed
+        for f in res:
+            os.remove(f)
+    if sub.returncode:
+        raise RuntimeError('merge returned {}'.format(sub.returncode))
+    return temp_name
+        
+
+def main():
+    """Command-line entry point: turn one or more URLs into a single e-book.
+
+    Each ``-i/--input`` URL is downloaded with ``download_book``.  Several
+    books are merged with the calibre ``EpubMerge`` plugin, applying the
+    optional ``-t/--title`` and ``-a/--author``; a single book is used as is
+    (title and author options are not applied in that case).  The result is
+    moved to ``-o/--output`` (default ``output.epub``) when that name ends in
+    ``.epub``, otherwise it is converted with ``ebook-convert``.
+
+    Raises:
+        RuntimeError: If the merge or the final conversion exits with a
+            non-zero return code.
+    """
+    parser = ArgumentParser()
+    parser.add_argument('-o', '--output', help='name of output file', dest='output')
+    # At least one URL is needed; without ``required`` a missing -i surfaced
+    # as a TypeError from iterating over None.
+    parser.add_argument('-i', '--input', help='input urls', dest='input', action='append',
+        required=True)
+    parser.add_argument('-t', '--title', help='set the title manually', dest='title', default=None)
+    parser.add_argument('-a', '--author', help='set the author manually', dest='author', default=None)
+    args = parser.parse_args()
+    inp = args.input
+    outp = args.output or 'output.epub'
+    title = args.title
+    author = args.author
+
+    # splitext yields the extension ('.epub'); os.path.split would yield the
+    # whole file name, so the comparison below could never succeed.
+    output_extension = os.path.splitext(outp)[1].lower()
+
+    books = [download_book(i) for i in inp]
+    if len(books) > 1:
+        temp_name = rand_name() + '.epub'
+        merge_args = ['calibre-debug', '--run-plugin', 'EpubMerge', '--',
+            '-N', '-o', temp_name]
+        if title is not None:
+            merge_args += ['-t', title]
+        if author is not None:
+            merge_args += ['-a', author]
+        merge_args += books
+        try:
+            sub = subprocess.run(merge_args, stdout=subprocess.DEVNULL)
+        finally:
+            # The per-input books are intermediates; remove them even when
+            # the merge failed.
+            for book in books:
+                os.remove(book)
+        if sub.returncode:
+            raise RuntimeError('final merge returned {}'.format(sub.returncode))
+    else:
+        # A single input needs no merge: its EPUB already is the intermediate
+        # result (previously temp_name was left undefined here).
+        temp_name = books[0]
+
+    if output_extension == '.epub':
+        # os.replace overwrites an existing target on every platform.
+        os.replace(temp_name, outp)
+    else:
+        try:
+            sub = subprocess.run(['ebook-convert', temp_name, outp], stdout=subprocess.DEVNULL)
+        finally:
+            os.remove(temp_name)
+        if sub.returncode:
+            raise RuntimeError('final conversion returned {}'.format(sub.returncode))
+    print('created {}'.format(outp))
+
+# Run the command-line interface only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == '__main__':
+    main()
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+certifi==2020.12.5
+chardet==4.0.0
+idna==2.10
+lxml==4.6.3
+requests==2.25.1
+urllib3==1.26.4