From 3e7b8b209060988ed020f0eda33b1a2f7c292be7 Mon Sep 17 00:00:00 2001 From: Rob Austein Date: Mon, 18 Mar 2019 20:44:57 +0000 Subject: Initial wiki dump and initial tools --- tools/convert-and-slurp-attachments.sh | 18 ++ tools/extract-wiki-content.xsl | 177 +++++++++++++++++ tools/rpki-wiki-to-markdown.py | 341 +++++++++++++++++++++++++++++++++ tools/trac-wiki-to-markdown.rb | 51 +++++ tools/trac2down.py | 61 ++++++ tools/trac2md.py | 192 +++++++++++++++++++ 6 files changed, 840 insertions(+) create mode 100755 tools/convert-and-slurp-attachments.sh create mode 100644 tools/extract-wiki-content.xsl create mode 100644 tools/rpki-wiki-to-markdown.py create mode 100644 tools/trac-wiki-to-markdown.rb create mode 100644 tools/trac2down.py create mode 100644 tools/trac2md.py diff --git a/tools/convert-and-slurp-attachments.sh b/tools/convert-and-slurp-attachments.sh new file mode 100755 index 0000000..ce7f34d --- /dev/null +++ b/tools/convert-and-slurp-attachments.sh @@ -0,0 +1,18 @@ +#!/bin/sh - + +ls | fgrep -v . | +while read page +do + base="https://trac.rpki.net" + path="/wiki/$(echo $page | sed s=%2F=/=g)" + + # Fetch the Wiki page, extract the useful portion of the HTML, convert that into Markdown + curl "${base}${path}" | + xsltproc --html extract-wiki-content.xsl - | + html2markdown --no-skip-internal-links --reference-links >"$page.md" + + # Fetch a ZIP file containing any attachments, clean up if result is empty or broken + curl "${base}/zip-attachment${path}/" >"$page.zip" + zipinfo "$page.zip" >/dev/null 2>&1 || rm -f "$page.zip" + +done diff --git a/tools/extract-wiki-content.xsl b/tools/extract-wiki-content.xsl new file mode 100644 index 0000000..e4376e8 --- /dev/null +++ b/tools/extract-wiki-content.xsl @@ -0,0 +1,177 @@ + + + + + + + + + + + + + + + + + + NEW PAGE + + +
+ +
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + _ + + + / + . + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ diff --git a/tools/rpki-wiki-to-markdown.py b/tools/rpki-wiki-to-markdown.py new file mode 100644 index 0000000..dff87e6 --- /dev/null +++ b/tools/rpki-wiki-to-markdown.py @@ -0,0 +1,341 @@ +# Copyright (C) 2016 Parsons Government Services ("PARSONS") +# Portions copyright (C) 2014 Dragon Research Labs ("DRL") +# Portions copyright (C) 2012 Internet Systems Consortium ("ISC") +# +# Permission to use, copy, modify, and distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notices and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND PARSONS, DRL, AND ISC DISCLAIM +# ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL +# PARSONS, DRL, OR ISC BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR +# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION +# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +""" +Trac Wiki -> Markdown converter, hacked from old Trac Wiki -> PDF/flat +text converter. + +Pull HTML pages from a Trac Wiki, feed the useful bits to +html2text to generate Markdown. + +Assumes you're using the TracNav plugin for the Wiki pages, and uses +the same list as the TracNav plugin does to determine the set of pages +to convert. +""" + +# Dependencies, at least on Ubuntu Xenial: +# +# apt-get install python-lxml python-html2text +# +# Be warned that there are many unrelated packages named "html2text", +# installed under various names on various platforms. This one +# happens to be a useful HTML-to-Markdown converter. + +# Most of the work of massaging the HTML is done using XSL transforms, +# because the template-driven style makes that easy. 
There's probably +# some clever way to use lxml's XPath code to do the same thing in a +# more pythonic way with ElementTrees, but I already had the XSL +# transforms and there's a point of diminishing returns on this sort of +# thing. + +import sys +import os +import argparse +import lxml.etree +import urllib +import urlparse +import subprocess +import zipfile + +# Main program, up front so it doesn't get lost under all the XSL + +def main(): + + base = "https://trac.rpki.net" + + parser = argparse.ArgumentParser(description = __doc__, formatter_class = argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-b", "--base_url", + default = base, + help = "base URL for documentation web site") + parser.add_argument("-t", "--toc", + default = base + "/wiki/doc/RPKI/TOC", + help = "table of contents URL") + parser.add_argument("-d", "--directory", + default = ".", + help = "output directory") + parser.add_argument("-p", "--prefix", + default = "/wiki/doc", + help = "page name prefix on wiki") + args = parser.parse_args() + + urls = str(xsl_get_toc(lxml.etree.parse(urllib.urlopen(args.toc)).getroot(), + basename = repr(args.base_url))).splitlines() + + assert all(urlparse.urlparse(url).path.startswith(args.prefix) for url in urls) + + for pagenum, url in enumerate(urls): + path = urlparse.urlparse(url).path + page = xsl_get_page(lxml.etree.parse(urllib.urlopen(url)).getroot(), + basename = repr(args.base_url), + path = repr(path)) + + fn_base = os.path.join(args.directory, "{:02d}{}".format(pagenum, path[len(args.prefix):].replace("/", "."))) + + fn = fn_base + ".zip" + zip_url = urlparse.urljoin(url, "/zip-attachment{}/".format(path)) + urllib.urlretrieve(zip_url, fn) + with zipfile.ZipFile(fn, "r") as z: + if len(z.namelist()) == 0: + os.unlink(fn) + else: + sys.stderr.write("Wrote {}\n".format(fn)) + + for imgnum, img in enumerate(page.xpath("//img | //object | //embed")): + img_url = img.get("data" if img.tag == "object" else "src") + img_url = 
urlparse.urljoin(url, img_url) + fn = "{}.{:02d}{}".format(fn_base, imgnum, os.path.splitext(img_url)[1]) + urllib.urlretrieve(img_url, fn) + sys.stderr.write("Wrote {}\n".format(fn)) + + html2markdown = subprocess.Popen(("html2markdown", "--no-skip-internal-links", "--reference-links"), + stdin = subprocess.PIPE, stdout = subprocess.PIPE) + page.write(html2markdown.stdin) + html2markdown.stdin.close() + lines = html2markdown.stdout.readlines() + html2markdown.stdout.close() + html2markdown.wait() + + while lines and lines[0].isspace(): + del lines[0] + + fn = fn_base + ".md" + with open(fn, "w") as f: + want_blank = False + for line in lines: + blank = line.isspace() + if want_blank and not blank: + f.write("\n") + if not blank: + f.write(line) + want_blank = blank + sys.stderr.write("Wrote {}\n".format(fn)) + + fn = fn[:-3] + ".wiki" + urllib.urlretrieve(url + "?format=txt", fn) + sys.stderr.write("Wrote {}\n".format(fn)) + + +# XSL transform to extract list of Wiki page URLs from the TOC Wiki page + +xsl_get_toc = lxml.etree.XSLT(lxml.etree.XML('''\ + + + + + + + + + + + + + +''')) + +# XSL transform to extract useful content of a Wiki page. + +# Django generates weird HTML for ordered lists: it sometimes breaks +# up a single ordered list into multiple adjacent
    elements, +# using the @start attribute to try to make the result look like a +# single ordered list. This looks OK in Firefox but confuses the +# bejesus out of both html2markdown and htmldoc. In some cases this is +# probably unavoidable, but most of the uses of this I've seen look +# gratuitous, and are probably the result of code modularity issues +# in Django. +# +# So we try to clean this up, by merging adjacent
      elements where +# we can. The merge incantation is an adaptation of: +# +# http://stackoverflow.com/questions/1806123/merging-adjacent-nodes-of-same-type-xslt-1-0 +# +# There may be a more efficient way to do this, but I don't think +# we care, and this seems to work. +# +# Original author's explanation: +# +# The rather convoluted XPath expression for selecting the following +# sibling aaa nodes which are merged with the current one: +# +# following-sibling::aaa[ # following 'aaa' siblings +# not(preceding-sibling::*[ # if they are not preceded by +# not(self::aaa) and # a non-'aaa' node +# not(following-sibling::aaa = current()) # after the current node +# ]) +# ] + +xsl_get_page = lxml.etree.XSLT(lxml.etree.XML('''\ + + + + + + + + + + + + + + + + NEW PAGE + + +
      + +
      + + +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + _ + + + / + . + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +''')) + +# All the files we want to parse are HTML, so make HTML the default +# parser. In theory the HTML produced by Trac is XHTML thus should +# parse correctly (in fact, better) as XML, but in practice this seems +# not to work properly at the moment, while parsing as HTML does. +# Haven't bothered to figure out why, life is too short. +# +# If you're reading this comment because this script stopped working +# after a Trac upgrade, try commenting out this line to see whether +# things have changed and Trac's HTML now parses better as XML. + +lxml.etree.set_default_parser(lxml.etree.HTMLParser()) + +# Run the main program. +main() diff --git a/tools/trac-wiki-to-markdown.rb b/tools/trac-wiki-to-markdown.rb new file mode 100644 index 0000000..f7d41ae --- /dev/null +++ b/tools/trac-wiki-to-markdown.rb @@ -0,0 +1,51 @@ +# Untested code snippet from https://gist.github.com/somebox/619537 + +class String + def trac_to_markdown! + gsub!(/\{\{\{([^\n]+?)\}\}\}/, '`\1`') + gsub!(/\{\{\{(.+?)\}\}\}/m){|m| m.each_line.map{|x| "\t#{x}".gsub(/[\{\}]{3}/,'')}.join} + gsub!(/\=\=\=\=\s(.+?)\s\=\=\=\=/, '### \1') + gsub!(/\=\=\=\s(.+?)\s\=\=\=/, '## \1') + gsub!(/\=\=\s(.+?)\s\=\=/, '# \1') + gsub!(/\=\s(.+?)\s\=[\s\n]*/, '') + gsub!(/\[(http[^\s\[\]]+)\s([^\[\]]+)\]/, '[\2](\1)') + gsub!(/\!(([A-Z][a-z0-9]+){2,})/, '\1') + gsub!(/'''(.+)'''/, '*\1*') + gsub!(/''(.+)''/, '_\1_') + gsub!(/^\s\*/, '*') + gsub!(/^\s\d\./, '1.') + + gsub!(/\{\{\{([^\n]+?)\}\}\}/, '`\1`') + gsub!(/'''(.+?)'''/, '**\1**') + gsub!(/''(.+?)''/, '*\1*') + gsub!(/((^\|\|[^\n\r]+\|\|[ \t]*\r?(\n|$))+)/m) do |m| + m = m.each_line.map do |x| + x.gsub(/\t/, ' ') + .gsub(/(\|\|){2,}/){|k| k.gsub(/\|\|/, '|| ')} + .gsub(/ {3,}/, ' ') + end.join + lines = m.each_line.to_a + line1 = lines.shift + line2 = line1.dup.gsub(/[^\n\r\|]/, '-') + lines.unshift(line1, line2) + c = lines.join + c = c.each_line.map do |x| + x.gsub(/\=\s?(.+?)\s?=/, ' \1 ') + .gsub(/\|\|/, '|') + end.join + end + 
gsub!(/^\{\{\{(.+?)^\}\}\}/m, '```\1```') + gsub!(/\=\=\=\=\s(.+?)\s\=\=\=\=/, '### \1') + gsub!(/\=\=\=\s(.+?)\s\=\=\=/, '## \1') + gsub!(/\=\=\s(.+?)\s\=\=/, '# \1') + gsub!(/\=\s(.+?)\s\=[\s\n]*/, '') + gsub!(/\[(http[^\s\[\]]+)\s([^\[\]]+)\]/, '[\2](\1)') + gsub!(/\!(([A-Z][a-z0-9]+){2,})/, '\1') + gsub!(/^\s\*/, '*') + gsub!(/^\s\d\./, '1.') + end +end + +some_trac = 'my document' + +puts some_trac.trac_to_markdown! diff --git a/tools/trac2down.py b/tools/trac2down.py new file mode 100644 index 0000000..5bb9094 --- /dev/null +++ b/tools/trac2down.py @@ -0,0 +1,61 @@ +#!/usr/bin/python + +# Untested code from https://gist.githubusercontent.com/sgk/1286682/raw/b744dd2e47a68d60373ad39df87cfe8256f517af/trac2down.py + +# vim:set fileencoding=utf-8 sw=2 ai: + +import sqlite3 +import datetime +import re + +SQL = ''' + select + name, version, time, author, text + from + wiki w + where + version = (select max(version) from wiki where name = w.name) +''' + +conn = sqlite3.connect('../trac.db') +result = conn.execute(SQL) +for row in result: + name = row[0] + version = row[1] + time = row[2] + author = row[3] + text = row[4] + + text = re.sub('\r\n', '\n', text) + text = re.sub(r'{{{(.*?)}}}', r'`\1`', text) + def indent4(m): + return '\n ' + m.group(1).replace('\n', '\n ') + text = re.sub(r'(?sm){{{\n(.*?)\n}}}', indent4, text) + text = re.sub(r'(?m)^====\s+(.*?)\s+====$', r'#### \1', text) + text = re.sub(r'(?m)^===\s+(.*?)\s+===$', r'### \1', text) + text = re.sub(r'(?m)^==\s+(.*?)\s+==$', r'## \1', text) + text = re.sub(r'(?m)^=\s+(.*?)\s+=$', r'# \1', text) + text = re.sub(r'^ * ', r'****', text) + text = re.sub(r'^ * ', r'***', text) + text = re.sub(r'^ * ', r'**', text) + text = re.sub(r'^ * ', r'*', text) + text = re.sub(r'^ \d+. 
', r'1.', text) + + a = [] + for line in text.split('\n'): + if not line.startswith(' '): + line = re.sub(r'\[(https?://[^\s\[\]]+)\s([^\[\]]+)\]', r'[\2](\1)', line) + line = re.sub(r'\[(wiki:[^\s\[\]]+)\s([^\[\]]+)\]', r'[\2](/\1/)', line) + line = re.sub(r'\!(([A-Z][a-z0-9]+){2,})', r'\1', line) + line = re.sub(r'\'\'\'(.*?)\'\'\'', r'*\1*', line) + line = re.sub(r'\'\'(.*?)\'\'', r'_\1_', line) + a.append(line) + text = '\n'.join(a) + + fp = file('%s.md' % name, 'w') + print >>fp, '' % name + print >>fp, '' % version + print >>fp, '' % datetime.datetime.fromtimestamp(time).strftime('%Y/%m/%d %H:%M:%S') + print >>fp, '' % author + fp.write(text.encode('utf-8')) + fp.close() diff --git a/tools/trac2md.py b/tools/trac2md.py new file mode 100644 index 0000000..40c09d4 --- /dev/null +++ b/tools/trac2md.py @@ -0,0 +1,192 @@ +#!/usr/bin/python + +# Untested code from https://www.snip2code.com/Snippet/1704331/Convert-trac-markup-to-Markdown/ + +# This code mostly taken from patches to pagure_importer by mreynolds + +import sys +import re +import time +import requests +import shutil +import os +from base64 import b64decode +from datetime import datetime + +wikilink_pattern = re.compile('\[http(.*)\]') +wikilink_extract = re.compile('\[(.*)\]') +wikiheading1_pattern = re.compile('^= (.*) =$') +wikiheading2_pattern = re.compile('^== (.*) ==$') +wikiheading3_pattern = re.compile('^=== (.*) ===$') +strikethrough_pattern = re.compile('~~(.*)~~') + +def to_timestamp(tm): + ''' Convert to timestamp which can be jsonified ''' + + tm = tm.replace('+00:00', '') + date = datetime.strptime(tm, '%Y-%m-%dT%H:%M:%S') + ts = str(time.mktime(date.timetuple()))[:-2] # Strip the .0 + return ts + + +def strip_wikilink(content): + ''' Need to remove wiki link format from custom fields. They come in a + variety of forms that can be comma or whitespace separated. They can also + include link names which must also be removed. 
+ + [https://bugzilla.redhat.com/show_bug.cgi?id=772777] + [https://bugzilla.com/123456789], [http://bugzilla.com/7777777 7777777] + [https://bugzilla.com/6666666 6666666] + ''' + + links = [] + if wikilink_pattern.search(content): + # Looks like we have a link in here + links = [] + mylist = re.findall(r'\[([^]]*)\]', content) + for i in mylist: + links.append(i.split(' ', 1)[0]) + return ', '.join(links) + else: + return content + + +def convert_headers(line): + ''' Convert wikiformat headers + ''' + level_count = 1 + for header in [wikiheading1_pattern, + wikiheading2_pattern, + wikiheading3_pattern]: + try: + level = header.search(line).group(1) + if level: + line = "%s %s" % ('#' * level_count, level) + break # No need to check other heading levels + except: + # Try the next heading level + pass + level_count += 1 + + return line + + +def convert_wikilinks(line): + ''' Convert wikiformat links + ''' + if wikilink_pattern.search(line): + try: + result = wikilink_extract.search(line).group(1) + if result: + parts = result.split(' ', 1) + if len(parts) == 1: + mdlink = '[%s](%s)' % (parts[0], parts[0]) + elif len(parts) == 2: + mdlink = '[%s](%s)' % (parts[1], parts[0]) + line = line.replace('[' + result + ']', mdlink) + except: + # Not a link, not a problem + pass + + return line + + +def convert_strike(line): + ''' Convert wikiformat striked text + ''' + striked_result = strikethrough_pattern.search(line) + if striked_result: + try: + striked_text = striked_result.group(1) + if striked_text: + orig_text = '~~%s~~' % striked_text + new_text = '%s' % striked_text + line = line.replace(orig_text, new_text) + except: + # Not striked + pass + return line + +def WikiToMD(content): + ''' Convert wiki/RST format to Markdown. Code blocks, bold/italics, + wiki links, lists, striked text, and headers. 
''' + + code_block = False + in_list = False + nested_level = 0 + prev_indent = 0 + new_content = "" + + for line in content.split('\n'): + line = line.replace("\r", "") + if "{{{" in line: + code_block = True + line = line.replace("{{{", "```") + if "}}}" in line: + code_block = False + line = line.replace("}}}", "```") + if not code_block: + # + # Convert bullet lists. The start and end of a list needs + # an empty line. wikiformat uses both '*' and '-' for its + # lists. However, markdown only supports '-'. + # + if line.startswith('* '): + if not in_list: + new_content = "%s\n" % (new_content) + in_list = True + line = line[1:] + line = '-%s' % (line) + elif line.startswith('- '): + # No need to modify the line, just add the new line + if not in_list: + new_content = "%s\n" % (new_content) + in_list = True + elif line.startswith(' '): + # Check for nested lists + nested_line = line.lstrip(' ') + if nested_line.startswith('* ') or \ + nested_line.startswith('- '): + # Adjust the nested list level as needed + indent = len(line) - len(nested_line) + if indent > prev_indent: + nested_level += 1 + elif indent < prev_indent: + nested_level -= 1 + prev_indent = indent + + # Set the proper indentation for markdown + line = ('%s-%s' % (' ' * nested_level, + nested_line[1:])) + else: + if in_list: + # Add the closing empty line + new_content = "%s\n" % (new_content) + in_list = False + nested_level = 0 + prev_indent = 0 + + # Convert headers + line = convert_headers(line) + + # Convert wiki links + line = convert_wikilinks(line) + + # Convert striked through text + line = convert_strike(line) + + # Convert bold and italic text (do this last) + line = line.replace("'''", "**") # Convert bold text + line = line.replace("''", "*") # Convert italic text + + new_content = "%s%s\n" % (new_content, line) + + return new_content + +for f in sys.argv[1:]: + d = WikiToMD(open(f, "r").read()) + newf = f.replace(".trac", ".md") + with open(newf, "w") as fp: + fp.write(d) + pass + 
pass -- cgit v1.2.3