From 23bb68fe7e9cc8af176ff60b56e8a51a70f05a89 Mon Sep 17 00:00:00 2001 From: Rob Austein Date: Sun, 14 Feb 2021 01:35:10 +0000 Subject: Now generating pages directly from sqlite3 --- tools/references/rpki-wiki-to-markdown.py | 341 ++++++++++++++++++++++++++++++ 1 file changed, 341 insertions(+) create mode 100644 tools/references/rpki-wiki-to-markdown.py (limited to 'tools/references/rpki-wiki-to-markdown.py') diff --git a/tools/references/rpki-wiki-to-markdown.py b/tools/references/rpki-wiki-to-markdown.py new file mode 100644 index 0000000..dff87e6 --- /dev/null +++ b/tools/references/rpki-wiki-to-markdown.py @@ -0,0 +1,341 @@ +# Copyright (C) 2016 Parsons Government Services ("PARSONS") +# Portions copyright (C) 2014 Dragon Research Labs ("DRL") +# Portions copyright (C) 2012 Internet Systems Consortium ("ISC") +# +# Permission to use, copy, modify, and distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notices and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND PARSONS, DRL, AND ISC DISCLAIM +# ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL +# PARSONS, DRL, OR ISC BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR +# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION +# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +""" +Trac Wiki -> Markdown converter, hacked from old Trac Wiki -> PDF/flat +text converter. + +Pull HTML pages from a Trac Wiki, feed the useful bits to +html2text to generate Markdown. + +Assumes you're using the TracNav plugin for the Wiki pages, and uses +the same list as the TracNav plugin does to determine the set of pages +to convert. +""" + +# Dependencies, at least on Ubuntu Xenial: +# +# apt-get install python-lxml python-html2text +# +# Be warned that there are many unrelated packages named "html2text", +# installed under various names on various platforms. This one +# happens to be a useful HTML-to-Markdown converter. + +# Most of the work of massaging the HTML is done using XSL transforms, +# because the template-driven style makes that easy. There's probably +# some clever way to use lxml's XPath code to do the same thing in a +# more pythonic way with ElementTrees, but I already had the XSL +# transforms and there's a point of diminishing returns on this sort of +# thing. + +import sys +import os +import argparse +import lxml.etree +import urllib +import urlparse +import subprocess +import zipfile + +# Main program, up front so it doesn't get lost under all the XSL + +def main(): + + base = "https://trac.rpki.net" + + parser = argparse.ArgumentParser(description = __doc__, formatter_class = argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-b", "--base_url", + default = base, + help = "base URL for documentation web site") + parser.add_argument("-t", "--toc", + default = base + "/wiki/doc/RPKI/TOC", + help = "table of contents URL") + parser.add_argument("-d", "--directory", + default = ".", + help = "output directory") + parser.add_argument("-p", "--prefix", + default = "/wiki/doc", + help = "page name prefix on wiki") + args = parser.parse_args() + + urls = str(xsl_get_toc(lxml.etree.parse(urllib.urlopen(args.toc)).getroot(), + basename = repr(args.base_url))).splitlines() + + assert all(urlparse.urlparse(url).path.startswith(args.prefix) for url in urls) + + for pagenum, url in enumerate(urls): + path = urlparse.urlparse(url).path + page = xsl_get_page(lxml.etree.parse(urllib.urlopen(url)).getroot(), + basename = repr(args.base_url), + path = repr(path)) + + fn_base = os.path.join(args.directory, "{:02d}{}".format(pagenum, path[len(args.prefix):].replace("/", "."))) + + fn = fn_base + ".zip" + zip_url = urlparse.urljoin(url, "/zip-attachment{}/".format(path)) + urllib.urlretrieve(zip_url, fn) + with zipfile.ZipFile(fn, "r") as z: + if len(z.namelist()) == 0: + os.unlink(fn) + else: + sys.stderr.write("Wrote {}\n".format(fn)) + + for imgnum, img in enumerate(page.xpath("//img | //object | //embed")): + img_url = img.get("data" if img.tag == "object" else "src") + img_url = urlparse.urljoin(url, img_url) + fn = "{}.{:02d}{}".format(fn_base, imgnum, os.path.splitext(img_url)[1]) + urllib.urlretrieve(img_url, fn) + sys.stderr.write("Wrote {}\n".format(fn)) + + html2markdown = subprocess.Popen(("html2markdown", "--no-skip-internal-links", "--reference-links"), + stdin = subprocess.PIPE, stdout = subprocess.PIPE) + page.write(html2markdown.stdin) + html2markdown.stdin.close() + lines = html2markdown.stdout.readlines() + html2markdown.stdout.close() + html2markdown.wait() + + while lines and lines[0].isspace(): + del lines[0] + + fn = fn_base + ".md" + with open(fn, "w") as f: + want_blank = False + for line in lines: + blank = line.isspace() + if want_blank and not blank: + f.write("\n") + if not blank: + f.write(line) + want_blank = blank + sys.stderr.write("Wrote {}\n".format(fn)) + + fn = fn[:-3] + ".wiki" + urllib.urlretrieve(url + "?format=txt", fn) + sys.stderr.write("Wrote {}\n".format(fn)) + + +# XSL transform to extract list of Wiki page URLs from the TOC Wiki page + +xsl_get_toc = lxml.etree.XSLT(lxml.etree.XML('''\ + + + + + + + + + + + + + +''')) + +# XSL transform to extract useful content of a Wiki page. + +# Django generates weird HTML for ordered lists: it sometimes breaks +# up a single ordered list into multiple adjacent
    elements, +# using the @start attribute to try to make the result look like a +# single ordered list. This looks OK in Firefox but confuses the +# bejesus out of both html2markdown and htmldoc. In some cases this is +# probably unavoidable, but most of the uses of this I've seen look +# gratuitous, and are probably the result of code modulararity issues +# in Django. +# +# So we try to clean this up, by merging adjacent
      elements where +# we can. The merge incantation is an adaptation of: +# +# http://stackoverflow.com/questions/1806123/merging-adjacent-nodes-of-same-type-xslt-1-0 +# +# There may be a more efficient way to do this, but I don't think +# we care, and this seems to work. +# +# Original author's explanation: +# +# The rather convoluted XPath expression for selecting the following +# sibling aaa nodes which are merged with the current one: +# +# following-sibling::aaa[ # following 'aaa' siblings +# not(preceding-sibling::*[ # if they are not preceded by +# not(self::aaa) and # a non-'aaa' node +# not(following-sibling::aaa = current()) # after the current node +# ]) +# ] + +xsl_get_page = lxml.etree.XSLT(lxml.etree.XML('''\ + + + + + + + + + + + + + + + + NEW PAGE + + +
      + +
      + + +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + _ + + + / + . + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +''')) + +# All the files we want to parse are HTML, so make HTML the default +# parser. In theory the HTML produced by Trac is XHTML thus should +# parse correctly (in fact, better) as XML, but in practice this seems +# not to work properly at the moment, while parsing as HTML does. +# Haven't bothered to figure out why, life is too short. +# +# If you're reading this comment because this script stopped working +# after a Trac upgrade, try commenting out this line to see whether +# things have changed and Trac's HTML now parses better as XML. + +lxml.etree.set_default_parser(lxml.etree.HTMLParser()) + +# Run the main program. +main() -- cgit v1.2.3