From 68e18ad1f44e9a6fab66adc38e97d027a58de8a4 Mon Sep 17 00:00:00 2001
From: Rob Austein
Date: Sun, 14 Feb 2021 16:56:57 +0000
Subject: Another reorg, and pelican samples

---
 references/convert-and-slurp-attachments.sh |  18 ++
 references/extract-wiki-content.xsl         | 177 +++++++++++++++
 references/generate-json.py                 | 154 +++++++++++++
 references/pelicanconf.py                   |  42 ++++
 references/publishconf.py                   |  19 ++
 references/rpki-wiki-to-markdown.py         | 341 ++++++++++++++++++++++++++++
 references/schema.sql                       | 177 +++++++++++++++
 references/trac-wiki-to-markdown.rb         |  51 +++++
 references/trac2down.py                     |  61 +++++
 9 files changed, 1040 insertions(+)
 create mode 100755 references/convert-and-slurp-attachments.sh
 create mode 100644 references/extract-wiki-content.xsl
 create mode 100755 references/generate-json.py
 create mode 100644 references/pelicanconf.py
 create mode 100644 references/publishconf.py
 create mode 100644 references/rpki-wiki-to-markdown.py
 create mode 100644 references/schema.sql
 create mode 100644 references/trac-wiki-to-markdown.rb
 create mode 100644 references/trac2down.py

diff --git a/references/convert-and-slurp-attachments.sh b/references/convert-and-slurp-attachments.sh
new file mode 100755
index 0000000..ce7f34d
--- /dev/null
+++ b/references/convert-and-slurp-attachments.sh
@@ -0,0 +1,18 @@
+#!/bin/sh -
+
+ls | fgrep -v . |
+while read page
+do
+    base="https://trac.rpki.net"
+    path="/wiki/$(echo $page | sed s=%2F=/=g)"
+
+    # Fetch the Wiki page, extract the useful portion of the HTML, convert that into Markdown
+    curl "${base}${path}" |
+    xsltproc --html extract-wiki-content.xsl - |
+    html2markdown --no-skip-internal-links --reference-links >"$page.md"
+
+    # Fetch a ZIP file containing any attachments, clean up if result is empty or broken
+    curl "${base}/zip-attachment${path}/" >"$page.zip"
+    zipinfo "$page.zip" >/dev/null 2>&1 || rm -f "$page.zip"
+
+done
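The script above takes its work list from the current directory: every filename
without a dot is treated as a URL-encoded Trac page name.  A sketch of one way
to seed such a directory, assuming shell access to the Trac host and page names
free of whitespace (trac-admin availability and the awk header-skip count are
assumptions, not part of the patch):

    mkdir pages && cd pages
    trac-admin /path/to/trac-env wiki list |
        awk 'NR > 2 { print $1 }' |        # skip trac-admin's table header
        sed 's=/=%2F=g' |                  # encode "/" the way the script expects
        xargs touch
    sh ../convert-and-slurp-attachments.sh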
diff --git a/references/extract-wiki-content.xsl b/references/extract-wiki-content.xsl
new file mode 100644
index 0000000..e4376e8
--- /dev/null
+++ b/references/extract-wiki-content.xsl
@@ -0,0 +1,177 @@
+<!-- [177-line XSLT stylesheet whose markup did not survive conversion of
+     this patch to text.  Its surviving text nodes are the "NEW PAGE"
+     marker written between pages and the "_", "/", "." literals used to
+     rewrite page names; per the scripts that use it, it extracts the
+     useful content of a rendered Trac wiki page.] -->

diff --git a/references/generate-json.py b/references/generate-json.py
new file mode 100755
index 0000000..b8b1f38
--- /dev/null
+++ b/references/generate-json.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+
+# Generate JSON to import Trac tickets into GitHub issues using the new import API
+# described at https://gist.github.com/jonmagic/5282384165e0f86ef105
+
+import os
+import time
+import json
+import yaml
+import sqlite3
+import hashlib
+import argparse
+import subprocess
+
+ticket_query = '''
+SELECT
+    id,
+    type,
+    owner,
+    reporter,
+    milestone,
+    status,
+    resolution,
+    summary,
+    description,
+    component,
+    priority,
+    time / 1000000 AS createdtime,
+    changetime / 1000000 AS modifiedtime
+FROM
+    ticket
+ORDER BY
+    id
+'''
+
+comment_query = '''
+SELECT
+    time / 1000000 AS createdtime,
+    author,
+    newvalue
+FROM
+    ticket_change
+WHERE
+    ticket = ?
+AND
+    field = 'comment'
+AND
+    newvalue <> ''
+ORDER BY
+    time
+'''
+
+attachment_query = '''
+SELECT
+    id,
+    filename,
+    size,
+    author,
+    description,
+    ipnr,
+    time / 1000000 AS createdtime
+FROM
+    attachment
+WHERE
+    id = ?
+AND
+    type = 'ticket'
+ORDER BY
+    time, filename
+'''
+
+def isotime(t):
+    return None if t == 0 else time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(t))
+
+def hashname(whatever):
+    return hashlib.sha1(unicode(whatever)).hexdigest()
+
+def ticket_text(ticket):
+    d = dict(ticket, createdtime = isotime(ticket["createdtime"]), modifiedtime = isotime(ticket["modifiedtime"]))
+    return u"{description}\n\n" \
+           u"_Trac ticket #{id} component {component} priority {priority}, owner {owner}," \
+           u" created by {reporter} on {createdtime}, last modified {modifiedtime}_\n".format(**d)
+
+def comment_text(comment):
+    d = dict(comment, createdtime = isotime(comment["createdtime"]))
+    return u"{newvalue}\n\n_Trac comment by {author} on {createdtime}_\n".format(**d)
+
+def attachment_text(attachment):
+    h1 = hashname(attachment["id"])
+    h2 = hashname(attachment["filename"])
+    fn2 = os.path.splitext(attachment["filename"])[1]
+    url = "{}/raw/{}/ticket.{}.{}{}".format(gist_url.rstrip("/"), gist_commit, h1, h2, fn2)
+    d = dict(attachment, createdtime = isotime(attachment["createdtime"]), url = url)
+    return u"[{filename}]({url}) {description}\n_Trac attachment by {author} on {createdtime}_\n".format(**d)
+
+def comment_merge(comments, attachments):
+    result = []
+    while comments and attachments:
+        result.append(comments.pop(0) if comments[0]["created_at"] <= attachments[0]["created_at"] else attachments.pop(0))
+    return result + comments + attachments
+
+parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument("-c", "--config", type = argparse.FileType(),
+                    default = "generate-json.yaml",
+                    help = "YAML config mappings")
+args = parser.parse_args()
+
+cfg = yaml.safe_load(args.config)
+assignee_map = cfg["assignees"]
+type_map = cfg["type_labels"]
+resolution_map = cfg["resolution_labels"]
+
+gist_url = cfg.get("attachment_gist_url")
+if gist_url is not None:
+    gist_commit = subprocess.check_output(("git", "ls-remote", gist_url, "HEAD")).split()[0]
+
+db = sqlite3.connect(cfg["database"])
+db.row_factory = sqlite3.Row
+ticket_cursor = db.cursor()
+comment_cursor = db.cursor()
+attachment_cursor = db.cursor()
+
+if not os.path.isdir(cfg["ticket_directory"]):
+    os.makedirs(cfg["ticket_directory"])
+
+for ticket in ticket_cursor.execute(ticket_query):
+    comments = comment_merge([dict(created_at = isotime(comment["createdtime"]), body = comment_text(comment))
+                              for comment in comment_cursor.execute(comment_query, (ticket["id"],))],
+                             [] if gist_url is None else
+                             [dict(created_at = isotime(attachment["createdtime"]), body = attachment_text(attachment))
+                              for attachment in attachment_cursor.execute(attachment_query, (ticket["id"],))])
+    issue = dict(
+        title = ticket["summary"],
+        body = ticket_text(ticket),
+        created_at = isotime(ticket["createdtime"]),
+        updated_at = isotime(ticket["modifiedtime"]))
+    if ticket["status"] == "closed":
+        issue["closed"] = True
+        issue["closed_at"] = isotime(ticket["modifiedtime"])
+        comments.append(dict(created_at = isotime(ticket["modifiedtime"]),
+                             body = "_Closed with resolution {resolution}_\n".format(**ticket)))
+    if ticket["owner"] in assignee_map:
+        issue["assignee"] = assignee_map[ticket["owner"]]
+    labels = [type_map.get(ticket["type"]), resolution_map.get(ticket["resolution"])]
+    while None in labels:
+        del labels[labels.index(None)]
+    if labels:
+        issue["labels"] = labels
+    issue = dict(issue = issue)
+    if comments:
+        issue["comments"] = comments
+    with open(os.path.join(cfg["ticket_directory"], "ticket_{:03d}.json".format(ticket["id"])), "wb") as f:
+        json.dump(issue, f, indent = 4, sort_keys = True, separators = (",", ": "))
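generate-json.py reads its mappings from a YAML file (generate-json.yaml by
default), but the patch does not include a sample.  A minimal sketch, every
value illustrative, followed by an upload of one result to the issue-import
endpoint described in the gist linked at the top of the script:

    cat >generate-json.yaml <<'EOF'
    database: trac.db                  # Trac SQLite database to read
    ticket_directory: tickets          # where the ticket_NNN.json files go
    attachment_gist_url: https://gist.github.com/example/0123456789abcdef
    assignees:                         # Trac owner -> GitHub login
      alice: alice-at-github
    type_labels:                       # Trac ticket type -> issue label
      defect: bug
      enhancement: enhancement
    resolution_labels:                 # Trac resolution -> issue label
      wontfix: wontfix
      duplicate: duplicate
    EOF

    ./generate-json.py --config generate-json.yaml

    # Each output file is already the {"issue": ..., "comments": ...}
    # envelope that the import API accepts:
    curl --request POST \
         --header "Authorization: token $GITHUB_TOKEN" \
         --header "Accept: application/vnd.github.golden-comet-preview+json" \
         --data @tickets/ticket_001.json \
         https://api.github.com/repos/OWNER/REPO/import/issues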
diff --git a/references/pelicanconf.py b/references/pelicanconf.py
new file mode 100644
index 0000000..a28721d
--- /dev/null
+++ b/references/pelicanconf.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*- #
+from __future__ import unicode_literals
+
+AUTHOR = u'Rob Austein'
+SITENAME = u'Your Bug Report Will Be Graded'
+
+# Apparently this is much longer than the theme designer expected.  Skip it for now.
+SITESUBTITLE = u'"I\'m not proud of being a congenital pain in the ass.  But I will take money for it."'
+
+PATH = 'content'
+TIMEZONE = 'UTC'
+DEFAULT_LANG = u'English'
+
+# Hack article URLs to match what Blogofile did, to avoid breaking links.
+
+ARTICLE_URL = '{date:%Y}/{date:%m}/{date:%d}/{slug}/'
+ARTICLE_SAVE_AS = '{date:%Y}/{date:%m}/{date:%d}/{slug}/index.html'
+
+# Feed generation is usually not desired when developing
+SITEURL = ''
+RELATIVE_URLS = True
+FEED_ALL_ATOM = None
+CATEGORY_FEED_ATOM = None
+TRANSLATION_FEED_ATOM = None
+AUTHOR_FEED_ATOM = None
+AUTHOR_FEED_RSS = None
+
+# Blogroll
+LINKS = (('Pelican', 'http://getpelican.com/'),
+         ('Python.org', 'http://python.org/'),
+         ('Jinja2', 'http://jinja.pocoo.org/'))
+LINKS_WIDGET_NAME = "Links"
+
+# Social widget.  Can't get rid of this with the default theme, only change its name.
+# Fiddle with themes later.
+SOCIAL = ()
+SOCIAL_WIDGET_NAME = "Subscribe"
+
+DEFAULT_PAGINATION = 10
+
+THEME = "/home/blog/pelican-themes/sundown"
diff --git a/references/publishconf.py b/references/publishconf.py
new file mode 100644
index 0000000..f0fb21d
--- /dev/null
+++ b/references/publishconf.py
@@ -0,0 +1,19 @@
+#!/usr/local/bin/python2.7
+# -*- coding: utf-8 -*- #
+from __future__ import unicode_literals
+
+# This file is only used if you use `make publish` or
+# explicitly specify it as your config file.
+
+import os
+import sys
+sys.path.append(os.curdir)
+from pelicanconf import *
+
+SITEURL = 'https://www.hactrn.net/blog'
+RELATIVE_URLS = False
+
+FEED_ALL_ATOM = 'feeds/all.atom.xml'
+CATEGORY_FEED_ATOM = 'feeds/{slug}.atom.xml'
+
+DELETE_OUTPUT_DIRECTORY = True
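The two files above follow the stock Pelican split: pelicanconf.py for local
preview, publishconf.py (which imports and overrides it) for the production
build at https://www.hactrn.net/blog.  Typical invocations, assuming the usual
Pelican layout with articles under content/:

    pelican content -s pelicanconf.py -o output    # development build: relative URLs, no feeds
    pelican content -s publishconf.py -o output    # production build, or use "make publish"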
diff --git a/references/rpki-wiki-to-markdown.py b/references/rpki-wiki-to-markdown.py
new file mode 100644
index 0000000..dff87e6
--- /dev/null
+++ b/references/rpki-wiki-to-markdown.py
@@ -0,0 +1,341 @@
+# Copyright (C) 2016 Parsons Government Services ("PARSONS")
+# Portions copyright (C) 2014 Dragon Research Labs ("DRL")
+# Portions copyright (C) 2012 Internet Systems Consortium ("ISC")
+#
+# Permission to use, copy, modify, and distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notices and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND PARSONS, DRL, AND ISC DISCLAIM
+# ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS.  IN NO EVENT SHALL
+# PARSONS, DRL, OR ISC BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
+# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+"""
+Trac Wiki -> Markdown converter, hacked from old Trac Wiki -> PDF/flat
+text converter.
+
+Pull HTML pages from a Trac Wiki, feed the useful bits to
+html2text to generate Markdown.
+
+Assumes you're using the TracNav plugin for the Wiki pages, and uses
+the same list as the TracNav plugin does to determine the set of pages
+to convert.
+"""
+
+# Dependencies, at least on Ubuntu Xenial:
+#
+#   apt-get install python-lxml python-html2text
+#
+# Be warned that there are many unrelated packages named "html2text",
+# installed under various names on various platforms.  This one
+# happens to be a useful HTML-to-Markdown converter.
+
+# Most of the work of massaging the HTML is done using XSL transforms,
+# because the template-driven style makes that easy.  There's probably
+# some clever way to use lxml's XPath code to do the same thing in a
+# more pythonic way with ElementTrees, but I already had the XSL
+# transforms and there's a point of diminishing returns on this sort of
+# thing.
+
+import sys
+import os
+import argparse
+import lxml.etree
+import urllib
+import urlparse
+import subprocess
+import zipfile
+
+# Main program, up front so it doesn't get lost under all the XSL
+
+def main():
+
+    base = "https://trac.rpki.net"
+
+    parser = argparse.ArgumentParser(description = __doc__, formatter_class = argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("-b", "--base_url",
+                        default = base,
+                        help = "base URL for documentation web site")
+    parser.add_argument("-t", "--toc",
+                        default = base + "/wiki/doc/RPKI/TOC",
+                        help = "table of contents URL")
+    parser.add_argument("-d", "--directory",
+                        default = ".",
+                        help = "output directory")
+    parser.add_argument("-p", "--prefix",
+                        default = "/wiki/doc",
+                        help = "page name prefix on wiki")
+    args = parser.parse_args()
+
+    urls = str(xsl_get_toc(lxml.etree.parse(urllib.urlopen(args.toc)).getroot(),
+                           basename = repr(args.base_url))).splitlines()
+
+    assert all(urlparse.urlparse(url).path.startswith(args.prefix) for url in urls)
+
+    for pagenum, url in enumerate(urls):
+        path = urlparse.urlparse(url).path
+        page = xsl_get_page(lxml.etree.parse(urllib.urlopen(url)).getroot(),
+                            basename = repr(args.base_url),
+                            path = repr(path))
+
+        fn_base = os.path.join(args.directory, "{:02d}{}".format(pagenum, path[len(args.prefix):].replace("/", ".")))
+
+        fn = fn_base + ".zip"
+        zip_url = urlparse.urljoin(url, "/zip-attachment{}/".format(path))
+        urllib.urlretrieve(zip_url, fn)
+        with zipfile.ZipFile(fn, "r") as z:
+            if len(z.namelist()) == 0:
+                os.unlink(fn)
+            else:
+                sys.stderr.write("Wrote {}\n".format(fn))
+
+        for imgnum, img in enumerate(page.xpath("//img | //object | //embed")):
+            img_url = img.get("data" if img.tag == "object" else "src")
+            img_url = urlparse.urljoin(url, img_url)
+            fn = "{}.{:02d}{}".format(fn_base, imgnum, os.path.splitext(img_url)[1])
+            urllib.urlretrieve(img_url, fn)
+            sys.stderr.write("Wrote {}\n".format(fn))
+
+        html2markdown = subprocess.Popen(("html2markdown", "--no-skip-internal-links", "--reference-links"),
+                                         stdin = subprocess.PIPE, stdout = subprocess.PIPE)
+        page.write(html2markdown.stdin)
+        html2markdown.stdin.close()
+        lines = html2markdown.stdout.readlines()
+        html2markdown.stdout.close()
+        html2markdown.wait()
+
+        while lines and lines[0].isspace():
+            del lines[0]
+
+        fn = fn_base + ".md"
+        with open(fn, "w") as f:
+            want_blank = False
+            for line in lines:
+                blank = line.isspace()
+                if want_blank and not blank:
+                    f.write("\n")
+                if not blank:
+                    f.write(line)
+                want_blank = blank
+        sys.stderr.write("Wrote {}\n".format(fn))
+
+        fn = fn[:-3] + ".wiki"
+        urllib.urlretrieve(url + "?format=txt", fn)
+        sys.stderr.write("Wrote {}\n".format(fn))
+
+
+# XSL transform to extract list of Wiki page URLs from the TOC Wiki page
+#
+# (The stylesheet source did not survive conversion of this patch to
+# text; it walks the TracNav list on the TOC page and writes one
+# absolute page URL per line, using the $basename parameter passed in
+# from main().)
+
+xsl_get_toc = lxml.etree.XSLT(lxml.etree.XML('''\
+...
+'''))
+
+# XSL transform to extract useful content of a Wiki page.
+
+# Django generates weird HTML for ordered lists: it sometimes breaks
+# up a single ordered list into multiple adjacent <ol> elements,
+# using the @start attribute to try to make the result look like a
+# single ordered list.  This looks OK in Firefox but confuses the
+# bejesus out of both html2markdown and htmldoc.  In some cases this is
+# probably unavoidable, but most of the uses of this I've seen look
+# gratuitous, and are probably the result of code modularity issues
+# in Django.
+#
+# So we try to clean this up, by merging adjacent <ol> elements where
+# we can.  The merge incantation is an adaptation of:
+#
+# http://stackoverflow.com/questions/1806123/merging-adjacent-nodes-of-same-type-xslt-1-0
+#
+# There may be a more efficient way to do this, but I don't think
+# we care, and this seems to work.
+#
+# Original author's explanation:
+#
+# The rather convoluted XPath expression for selecting the following
+# sibling aaa nodes which are merged with the current one:
+#
+#   following-sibling::aaa[                       # following 'aaa' siblings
+#     not(preceding-sibling::*[                   # if they are not preceded by
+#       not(self::aaa) and                        # a non-'aaa' node
+#       not(following-sibling::aaa = current())   # after the current node
+#     ])
+#   ]
+#
+# (This stylesheet's source did not survive conversion either; its
+# surviving literals are the "NEW PAGE" marker written between pages
+# and the "_", "/", "." characters used to rewrite page names.  It
+# takes $basename and $path parameters, passed in from main().)
+
+xsl_get_page = lxml.etree.XSLT(lxml.etree.XML('''\
+...
+'''))
+
+# All the files we want to parse are HTML, so make HTML the default
+# parser.  In theory the HTML produced by Trac is XHTML and thus should
+# parse correctly (in fact, better) as XML, but in practice this seems
+# not to work properly at the moment, while parsing as HTML does.
+# Haven't bothered to figure out why, life is too short.
+#
+# If you're reading this comment because this script stopped working
+# after a Trac upgrade, try commenting out this line to see whether
+# things have changed and Trac's HTML now parses better as XML.
+
+lxml.etree.set_default_parser(lxml.etree.HTMLParser())
+
+# Run the main program.
+main()
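A plausible run of the converter above, leaning on its argparse defaults
(Python 2 plus the two Ubuntu packages named in its header comment; the
output directory name is arbitrary):

    apt-get install python-lxml python-html2text
    python rpki-wiki-to-markdown.py --directory converted-wiki

For each TOC entry this writes a numbered NN.page.md (Markdown) and
NN.page.wiki (original wiki text), plus NN.page.zip and numbered image
files when the page has attachments or images.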
diff --git a/references/schema.sql b/references/schema.sql
new file mode 100644
index 0000000..1515dbb
--- /dev/null
+++ b/references/schema.sql
@@ -0,0 +1,177 @@
+CREATE TABLE system (
+    name text PRIMARY KEY,
+    value text
+);
+CREATE TABLE permission (
+    username text,
+    action text,
+    UNIQUE (username,action)
+);
+CREATE TABLE auth_cookie (
+    cookie text,
+    name text,
+    ipnr text,
+    time integer,
+    UNIQUE (cookie,ipnr,name)
+);
+CREATE TABLE session (
+    sid text,
+    authenticated integer,
+    last_visit integer,
+    UNIQUE (sid,authenticated)
+);
+CREATE INDEX session_last_visit_idx ON session (last_visit);
+CREATE INDEX session_authenticated_idx ON session (authenticated);
+CREATE TABLE session_attribute (
+    sid text,
+    authenticated integer,
+    name text,
+    value text,
+    UNIQUE (sid,authenticated,name)
+);
+CREATE TABLE cache (
+    id integer PRIMARY KEY,
+    generation integer,
+    key text
+);
+CREATE TABLE attachment (
+    type text,
+    id text,
+    filename text,
+    size integer,
+    time integer,
+    description text,
+    author text,
+    ipnr text,
+    UNIQUE (type,id,filename)
+);
+CREATE TABLE wiki (
+    name text,
+    version integer,
+    time integer,
+    author text,
+    ipnr text,
+    text text,
+    comment text,
+    readonly integer,
+    UNIQUE (name,version)
+);
+CREATE INDEX wiki_time_idx ON wiki (time);
+CREATE TABLE repository (
+    id integer,
+    name text,
+    value text,
+    UNIQUE (id,name)
+);
+CREATE TABLE revision (
+    repos integer,
+    rev text,
+    time integer,
+    author text,
+    message text,
+    UNIQUE (repos,rev)
+);
+CREATE INDEX revision_repos_time_idx ON revision (repos,time);
+CREATE TABLE ticket (
+    id integer PRIMARY KEY,
+    type text,
+    time integer,
+    changetime integer,
+    component text,
+    severity text,
+    priority text,
+    owner text,
+    reporter text,
+    cc text,
+    version text,
+    milestone text,
+    status text,
+    resolution text,
+    summary text,
+    description text,
+    keywords text
+);
+CREATE INDEX ticket_time_idx ON ticket (time);
+CREATE INDEX ticket_status_idx ON ticket (status);
+CREATE TABLE ticket_change (
+    ticket integer,
+    time integer,
+    author text,
+    field text,
+    oldvalue text,
+    newvalue text,
+    UNIQUE (ticket,time,field)
+);
+CREATE INDEX ticket_change_ticket_idx ON ticket_change (ticket);
+CREATE INDEX ticket_change_time_idx ON ticket_change (time);
+CREATE TABLE ticket_custom (
+    ticket integer,
+    name text,
+    value text,
+    UNIQUE (ticket,name)
+);
+CREATE TABLE enum (
+    type text,
+    name text,
+    value text,
+    UNIQUE (type,name)
+);
+CREATE TABLE component (
+    name text PRIMARY KEY,
+    owner text,
+    description text
+);
+CREATE TABLE milestone (
+    name text PRIMARY KEY,
+    due integer,
+    completed integer,
+    description text
+);
+CREATE TABLE version (
+    name text PRIMARY KEY,
+    time integer,
+    description text
+);
+CREATE TABLE report (
+    id integer PRIMARY KEY,
+    author text,
+    title text,
+    query text,
+    description text
+);
+CREATE TABLE notify_subscription (
+    id integer PRIMARY KEY,
+    time integer,
+    changetime integer,
+    class text,
+    sid text,
+    authenticated integer,
+    distributor text,
+    format text,
+    priority integer,
+    adverb text
+);
+CREATE INDEX notify_subscription_sid_authenticated_idx ON notify_subscription (sid,authenticated);
+CREATE INDEX notify_subscription_class_idx ON notify_subscription (class);
+CREATE TABLE notify_watch (
+    id integer PRIMARY KEY,
+    sid text,
+    authenticated integer,
+    class text,
+    realm text,
+    target text
+);
+CREATE INDEX notify_watch_sid_authenticated_class_idx ON notify_watch (sid,authenticated,class);
+CREATE INDEX notify_watch_class_realm_target_idx ON notify_watch (class,realm,target);
+CREATE TABLE node_change (
+    id integer PRIMARY KEY,
+    repos integer,
+    rev text,
+    path text,
+    node_type text,
+    change_type text,
+    base_path text,
+    base_rev text
+);
+CREATE INDEX node_change_repos_rev_path_idx ON node_change (repos,rev,path);
+CREATE INDEX node_change_repos_path_rev_idx ON node_change (repos,path,rev);
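The schema above is the Trac SQLite schema that the converters in this patch
query.  Two quick sanity checks against a copy of the database, using the same
tables the scripts read (table and column names are from the schema; the
database filename is whatever your Trac environment uses):

    sqlite3 trac.db 'SELECT id, status, summary FROM ticket ORDER BY id LIMIT 5;'
    sqlite3 trac.db 'SELECT name, MAX(version) FROM wiki GROUP BY name LIMIT 5;'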
diff --git a/references/trac-wiki-to-markdown.rb b/references/trac-wiki-to-markdown.rb
new file mode 100644
index 0000000..f7d41ae
--- /dev/null
+++ b/references/trac-wiki-to-markdown.rb
@@ -0,0 +1,51 @@
+# Untested code snippet from https://gist.github.com/somebox/619537
+
+class String
+  def trac_to_markdown!
+    gsub!(/\{\{\{([^\n]+?)\}\}\}/, '`\1`')
+    gsub!(/\{\{\{(.+?)\}\}\}/m){|m| m.each_line.map{|x| "\t#{x}".gsub(/[\{\}]{3}/,'')}.join}
+    gsub!(/\=\=\=\=\s(.+?)\s\=\=\=\=/, '### \1')
+    gsub!(/\=\=\=\s(.+?)\s\=\=\=/, '## \1')
+    gsub!(/\=\=\s(.+?)\s\=\=/, '# \1')
+    gsub!(/\=\s(.+?)\s\=[\s\n]*/, '')
+    gsub!(/\[(http[^\s\[\]]+)\s([^\[\]]+)\]/, '[\2](\1)')
+    gsub!(/\!(([A-Z][a-z0-9]+){2,})/, '\1')
+    gsub!(/'''(.+)'''/, '*\1*')
+    gsub!(/''(.+)''/, '_\1_')
+    gsub!(/^\s\*/, '*')
+    gsub!(/^\s\d\./, '1.')
+
+    gsub!(/\{\{\{([^\n]+?)\}\}\}/, '`\1`')
+    gsub!(/'''(.+?)'''/, '**\1**')
+    gsub!(/''(.+?)''/, '*\1*')
+    gsub!(/((^\|\|[^\n\r]+\|\|[ \t]*\r?(\n|$))+)/m) do |m|
+      m = m.each_line.map do |x|
+        x.gsub(/\t/, ' ')
+         .gsub(/(\|\|){2,}/){|k| k.gsub(/\|\|/, '|| ')}
+         .gsub(/ {3,}/, ' ')
+      end.join
+      lines = m.each_line.to_a
+      line1 = lines.shift
+      line2 = line1.dup.gsub(/[^\n\r\|]/, '-')
+      lines.unshift(line1, line2)
+      c = lines.join
+      c = c.each_line.map do |x|
+        x.gsub(/\=\s?(.+?)\s?\=/, ' \1 ')
+         .gsub(/\|\|/, '|')
+      end.join
+    end
+    gsub!(/^\{\{\{(.+?)^\}\}\}/m, '```\1```')
+    gsub!(/\=\=\=\=\s(.+?)\s\=\=\=\=/, '### \1')
+    gsub!(/\=\=\=\s(.+?)\s\=\=\=/, '## \1')
+    gsub!(/\=\=\s(.+?)\s\=\=/, '# \1')
+    gsub!(/\=\s(.+?)\s\=[\s\n]*/, '')
+    gsub!(/\[(http[^\s\[\]]+)\s([^\[\]]+)\]/, '[\2](\1)')
+    gsub!(/\!(([A-Z][a-z0-9]+){2,})/, '\1')
+    gsub!(/^\s\*/, '*')
+    gsub!(/^\s\d\./, '1.')
+  end
+end
+
+some_trac = 'my document'
+
+puts some_trac.trac_to_markdown!

diff --git a/references/trac2down.py b/references/trac2down.py
new file mode 100644
index 0000000..c66a201
--- /dev/null
+++ b/references/trac2down.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python2
+
+# Untested code from https://gist.githubusercontent.com/sgk/1286682/raw/b744dd2e47a68d60373ad39df87cfe8256f517af/trac2down.py
+
+# vim:set fileencoding=utf-8 sw=2 ai:
+
+import sqlite3
+import datetime
+import re
+
+SQL = '''
+  select
+    name, version, time, author, text
+  from
+    wiki w
+  where
+    version = (select max(version) from wiki where name = w.name)
+'''
+
+conn = sqlite3.connect('../trac.db')
+result = conn.execute(SQL)
+for row in result:
+  name = row[0]
+  version = row[1]
+  time = row[2]
+  author = row[3]
+  text = row[4]
+
+  text = re.sub('\r\n', '\n', text)
+  text = re.sub(r'{{{(.*?)}}}', r'`\1`', text)
+  def indent4(m):
+    return '\n    ' + m.group(1).replace('\n', '\n    ')
+  text = re.sub(r'(?sm){{{\n(.*?)\n}}}', indent4, text)
+  text = re.sub(r'(?m)^====\s+(.*?)\s+====$', r'#### \1', text)
+  text = re.sub(r'(?m)^===\s+(.*?)\s+===$', r'### \1', text)
+  text = re.sub(r'(?m)^==\s+(.*?)\s+==$', r'## \1', text)
+  text = re.sub(r'(?m)^=\s+(.*?)\s+=$', r'# \1', text)
+  text = re.sub(r'^       * ', r'****', text)
+  text = re.sub(r'^     * ', r'***', text)
+  text = re.sub(r'^   * ', r'**', text)
+  text = re.sub(r'^ * ', r'*', text)
+  text = re.sub(r'^ \d+. ', r'1.', text)
+
+  a = []
+  for line in text.split('\n'):
+    if not line.startswith('    '):
+      line = re.sub(r'\[(https?://[^\s\[\]]+)\s([^\[\]]+)\]', r'[\2](\1)', line)
+      line = re.sub(r'\[(wiki:[^\s\[\]]+)\s([^\[\]]+)\]', r'[\2](/\1/)', line)
+      line = re.sub(r'\!(([A-Z][a-z0-9]+){2,})', r'\1', line)
+      line = re.sub(r'\'\'\'(.*?)\'\'\'', r'*\1*', line)
+      line = re.sub(r'\'\'(.*?)\'\'', r'_\1_', line)
+    a.append(line)
+  text = '\n'.join(a)
+
+  fp = file('%s.md' % name, 'w')
+  print >>fp, '<!-- Name: %s -->' % name
+  print >>fp, '<!-- Version: %s -->' % version
+  print >>fp, '<!-- Last-Modified: %s -->' % datetime.datetime.fromtimestamp(time).strftime('%Y/%m/%d %H:%M:%S')
+  print >>fp, '<!-- Author: %s -->' % author
+  fp.write(text.encode('utf-8'))
+  fp.close()
-- 
cgit v1.2.3