From 23bb68fe7e9cc8af176ff60b56e8a51a70f05a89 Mon Sep 17 00:00:00 2001
From: Rob Austein
Date: Sun, 14 Feb 2021 01:35:10 +0000
Subject: Now generating pages directly from sqlite3

---
 tools/rpki-wiki-to-markdown.py | 341 -----------------------------------------
 1 file changed, 341 deletions(-)
 delete mode 100644 tools/rpki-wiki-to-markdown.py

diff --git a/tools/rpki-wiki-to-markdown.py b/tools/rpki-wiki-to-markdown.py
deleted file mode 100644
index dff87e6..0000000
--- a/tools/rpki-wiki-to-markdown.py
+++ /dev/null
@@ -1,341 +0,0 @@
-# Copyright (C) 2016 Parsons Government Services ("PARSONS")
-# Portions copyright (C) 2014 Dragon Research Labs ("DRL")
-# Portions copyright (C) 2012 Internet Systems Consortium ("ISC")
-#
-# Permission to use, copy, modify, and distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notices and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND PARSONS, DRL, AND ISC DISCLAIM
-# ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS.  IN NO EVENT SHALL
-# PARSONS, DRL, OR ISC BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
-# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
-# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
-# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
-"""
-Trac Wiki -> Markdown converter, hacked from old Trac Wiki -> PDF/flat
-text converter.
-
-Pull HTML pages from a Trac Wiki, feed the useful bits to
-html2text to generate Markdown.
-
-Assumes you're using the TracNav plugin for the Wiki pages, and uses
-the same list as the TracNav plugin does to determine the set of pages
-to convert.
-"""
-
-# Dependencies, at least on Ubuntu Xenial:
-#
-#   apt-get install python-lxml python-html2text
-#
-# Be warned that there are many unrelated packages named "html2text",
-# installed under various names on various platforms.  This one
-# happens to be a useful HTML-to-Markdown converter.
-
-# Most of the work of massaging the HTML is done using XSL transforms,
-# because the template-driven style makes that easy.  There's probably
-# some clever way to use lxml's XPath code to do the same thing in a
-# more pythonic way with ElementTrees, but I already had the XSL
-# transforms and there's a point of diminishing returns on this sort of
-# thing.
-
-import sys
-import os
-import argparse
-import lxml.etree
-import urllib
-import urlparse
-import subprocess
-import zipfile
-
-# Main program, up front so it doesn't get lost under all the XSL
-
-def main():
-
-    base = "https://trac.rpki.net"
-
-    parser = argparse.ArgumentParser(description = __doc__, formatter_class = argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("-b", "--base_url",
-                        default = base,
-                        help = "base URL for documentation web site")
-    parser.add_argument("-t", "--toc",
-                        default = base + "/wiki/doc/RPKI/TOC",
-                        help = "table of contents URL")
-    parser.add_argument("-d", "--directory",
-                        default = ".",
-                        help = "output directory")
-    parser.add_argument("-p", "--prefix",
-                        default = "/wiki/doc",
-                        help = "page name prefix on wiki")
-    args = parser.parse_args()
-
-    urls = str(xsl_get_toc(lxml.etree.parse(urllib.urlopen(args.toc)).getroot(),
-                           basename = repr(args.base_url))).splitlines()
-
-    assert all(urlparse.urlparse(url).path.startswith(args.prefix) for url in urls)
-
-    for pagenum, url in enumerate(urls):
-        path = urlparse.urlparse(url).path
-        page = xsl_get_page(lxml.etree.parse(urllib.urlopen(url)).getroot(),
-                            basename = repr(args.base_url),
-                            path = repr(path))
-
-        fn_base = os.path.join(args.directory, "{:02d}{}".format(pagenum, path[len(args.prefix):].replace("/", ".")))
-
-        fn = fn_base + ".zip"
-        zip_url = urlparse.urljoin(url, "/zip-attachment{}/".format(path))
-        urllib.urlretrieve(zip_url, fn)
-        with zipfile.ZipFile(fn, "r") as z:
-            if len(z.namelist()) == 0:
-                os.unlink(fn)
-            else:
-                sys.stderr.write("Wrote {}\n".format(fn))
-
-        for imgnum, img in enumerate(page.xpath("//img | //object | //embed")):
-            img_url = img.get("data" if img.tag == "object" else "src")
-            img_url = urlparse.urljoin(url, img_url)
-            fn = "{}.{:02d}{}".format(fn_base, imgnum, os.path.splitext(img_url)[1])
-            urllib.urlretrieve(img_url, fn)
-            sys.stderr.write("Wrote {}\n".format(fn))
-
-        html2markdown = subprocess.Popen(("html2markdown", "--no-skip-internal-links", "--reference-links"),
-                                         stdin = subprocess.PIPE, stdout = subprocess.PIPE)
-        page.write(html2markdown.stdin)
-        html2markdown.stdin.close()
-        lines = html2markdown.stdout.readlines()
-        html2markdown.stdout.close()
-        html2markdown.wait()
-
-        while lines and lines[0].isspace():
-            del lines[0]
-
-        fn = fn_base + ".md"
-        with open(fn, "w") as f:
-            want_blank = False
-            for line in lines:
-                blank = line.isspace()
-                if want_blank and not blank:
-                    f.write("\n")
-                if not blank:
-                    f.write(line)
-                want_blank = blank
-        sys.stderr.write("Wrote {}\n".format(fn))
-
-        fn = fn[:-3] + ".wiki"
-        urllib.urlretrieve(url + "?format=txt", fn)
-        sys.stderr.write("Wrote {}\n".format(fn))
-
-
-# XSL transform to extract list of Wiki page URLs from the TOC Wiki page
-
-xsl_get_toc = lxml.etree.XSLT(lxml.etree.XML('''\
    [... xsl:stylesheet markup not preserved ...]
-'''))
-
-# XSL transform to extract useful content of a Wiki page.
-
-# Django generates weird HTML for ordered lists: it sometimes breaks
-# up a single ordered list into multiple adjacent <ol> elements,
-# using the @start attribute to try to make the result look like a
-# single ordered list.  This looks OK in Firefox but confuses the
-# bejesus out of both html2markdown and htmldoc.  In some cases this
-# is probably unavoidable, but most of the uses of this I've seen look
-# gratuitous, and are probably the result of code modularity issues
-# in Django.
-#
-# So we try to clean this up, by merging adjacent <ol> elements where
-# we can.  The merge incantation is an adaptation of:
-#
-# http://stackoverflow.com/questions/1806123/merging-adjacent-nodes-of-same-type-xslt-1-0
-#
-# There may be a more efficient way to do this, but I don't think
-# we care, and this seems to work.
-#
-# Original author's explanation:
-#
-# The rather convoluted XPath expression for selecting the following
-# sibling aaa nodes which are merged with the current one:
-#
-#    following-sibling::aaa[                          # following 'aaa' siblings
-#        not(preceding-sibling::*[                    # if they are not preceded by
-#            not(self::aaa) and                       # a non-'aaa' node
-#            not(following-sibling::aaa = current())  # after the current node
-#        ])
-#    ]
-
-xsl_get_page = lxml.etree.XSLT(lxml.etree.XML('''\
    [... xsl:stylesheet markup not preserved; literal "NEW PAGE" separator text survives from the template ...]
-'''))
-
-# All the files we want to parse are HTML, so make HTML the default
-# parser.  In theory the HTML produced by Trac is XHTML thus should
-# parse correctly (in fact, better) as XML, but in practice this seems
-# not to work properly at the moment, while parsing as HTML does.
-# Haven't bothered to figure out why, life is too short.
-#
-# If you're reading this comment because this script stopped working
-# after a Trac upgrade, try commenting out this line to see whether
-# things have changed and Trac's HTML now parses better as XML.
-
-lxml.etree.set_default_parser(lxml.etree.HTMLParser())
-
-# Run the main program.
-main()
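
The xsl_get_page stylesheet is a large transform whose markup is not reproduced above, but the adjacent-<ol> merge its comments describe can be illustrated in isolation. The following minimal lxml/XSLT sketch folds runs of adjacent <ol> elements into a single list; it uses a plain recursive sibling walk rather than the exact following-sibling incantation quoted from Stack Overflow, and it is an illustration of the technique, not the original stylesheet.

import lxml.etree

# Identity transform plus two <ol> rules: drop any <ol> that immediately
# follows another <ol>, and make the first <ol> of a run pull in the
# children of each immediately-following <ol>.
merge_ol = lxml.etree.XSLT(lxml.etree.XML('''\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

  <xsl:template match="@*|node()">
    <xsl:copy><xsl:apply-templates select="@*|node()"/></xsl:copy>
  </xsl:template>

  <!-- An <ol> whose nearest preceding element sibling is also an <ol>:
       suppress it; its items are emitted by the first <ol> of the run. -->
  <xsl:template match="ol[preceding-sibling::*[1][self::ol]]"/>

  <!-- First <ol> of a run: copy it, then append the items of the
       following <ol> elements via the merge-mode walk below. -->
  <xsl:template match="ol">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
      <xsl:apply-templates select="following-sibling::*[1][self::ol]" mode="merge"/>
    </xsl:copy>
  </xsl:template>

  <xsl:template match="ol" mode="merge">
    <xsl:apply-templates select="node()"/>
    <xsl:apply-templates select="following-sibling::*[1][self::ol]" mode="merge"/>
  </xsl:template>

</xsl:stylesheet>
'''))

# Example: two adjacent <ol> elements, the second using @start, become one list.
doc = lxml.etree.HTML("<div><ol><li>a</li></ol><ol start='2'><li>b</li></ol></div>")
print(lxml.etree.tostring(merge_ol(doc), pretty_print=True).decode())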
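
For comparison, the deleted script is Python 2 and shells out to an external html2markdown program for the HTML-to-Markdown step. A rough Python 3 sketch of the same fetch-and-convert loop, using the html2text library API directly, might look like the following; the TOC URL, the output naming, and the html2text settings are illustrative assumptions, not the original logic.

import sys
import urllib.request

import html2text   # pip install html2text

def fetch(url):
    # Fetch a wiki page rendered as HTML.
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode("utf-8", "replace")

def to_markdown(html):
    # Roughly the library equivalent of:
    #   html2markdown --no-skip-internal-links --reference-links
    h = html2text.HTML2Text()
    h.skip_internal_links = False
    h.inline_links = False     # emit reference-style links
    h.body_width = 0           # don't re-wrap paragraphs
    return h.handle(html)

def convert(urls, directory="."):
    # Write one numbered Markdown file per wiki page URL.
    for pagenum, url in enumerate(urls):
        fn = "{}/{:02d}.md".format(directory, pagenum)
        with open(fn, "w", encoding="utf-8") as f:
            f.write(to_markdown(fetch(url)))
        sys.stderr.write("Wrote {}\n".format(fn))

if __name__ == "__main__":
    convert(["https://trac.rpki.net/wiki/doc/RPKI/Installation"])  # illustrative URL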