diff options
author | Rob Austein <sra@hactrn.net> | 2021-02-16 06:46:38 +0000 |
---|---|---|
committer | Rob Austein <sra@hactrn.net> | 2021-02-16 06:46:38 +0000 |
commit | a4a027a6700e1197a50b24bdfbde1697049b7348 (patch) | |
tree | 6d0c71cdb020bb7b6725414ed5ca50a97d5f20e2 | |
parent | 61a631cbcf8c584dd8adde13d26d005eb4675935 (diff) |
Refactor Wiki2Markdown as class
Makes portions of the mess clearer, though not as much as I'd hoped.
With a bit more work we could fold header-crunching into the existing
regexp mechanism, but code blocks, links, and tables require state.
-rwxr-xr-x | extract.py | 4 | ||||
-rwxr-xr-x | trac2md.py | 335 |
2 files changed, 168 insertions, 171 deletions
@@ -92,6 +92,8 @@ def main(): os.link("pelicanconf.py", "pelican/pelicanconf.py") + wiki_to_markdown = trac2md.Trac2Markdown() + keep = Filter() first_published = {} @@ -105,7 +107,7 @@ def main(): #print(slug, row.version) with open("wiki/{}.trac".format(slug), "w") as f: f.write(row.text) - md = markdown_header(row, first_published) + trac2md.WikiToMD(row.text, slug) + md = markdown_header(row, first_published) + wiki_to_markdown(row.text, slug) with open("pelican/content/{}.md".format(slug), "w") as f: f.write(md) @@ -5,192 +5,187 @@ import re from urllib.parse import quote -content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I) - -traclink_pattern = re.compile(r"(?<!\[)\[([^][]+)\]") - -image_pattern = re.compile(r"\[\[Image\((.*)\)\]\]") - -wikilink_pattern = re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]") - -strikethrough_pattern = re.compile(r"~~([^~]+)~~") -bangquote_pattern = re.compile(r"!((?:\w|[#])+)") -linebreak_pattern = re.compile(r"\\\\$") - -camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))") - -span_pattern = re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]") - -delete_pattern = re.compile(r"\[\[PageOutline\]\]", re.I) - -wikiheading_patterns = tuple( - (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level))) - for level in range(1, 7)) - - -def convert_headers(line): - for level_count, header in wikiheading_patterns: - try: - level = header.search(line).group(1) - if level: - line = "{} {}".format('#' * level_count, level.rstrip("= \r\t")) - break # No need to check other heading levels - except: - pass # Try the next heading level - return line - - -def convert_traclink_to_creolelink(m): - # Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format. - # Creole's is easier to parse and harder to confuse with partially converted Markdown. 
- - text = m.group(1).strip() - if " " in text: - return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1)) - elif ":" in text or camelcase_pattern.match(text): - return "[[{}]]".format(text) - else: - return m.group(0) - - -# Probably most of the non-wiki scheme tests should become a table in an -# extended JSON config file which maps -# -# { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" } - -def convert_wikilinks(m, slug, giturl): - scheme, link, text = [p.strip() if p else p for p in m.groups()] - if text is None: - text = link - if any(link.startswith(q) and link.endswith(q) for q in ('"', "'")): - link = link[1:-1] - if any(text.startswith(q) and text.endswith(q) for q in ('"', "'")): - text = text[1:-1] - if text == link and link.startswith("http") and "://" in link: - return "<{}>".format(link) - elif scheme == "attachment:": - return "[{}]({{attach}}{}/{})".format(text, slug, link) - elif scheme in ("source:", "browser:"): - return "[{}]({}/{})".format(text, giturl.rstrip("/"), link.lstrip("/")) - elif scheme == "wiki:" or (scheme is None and camelcase_pattern.match(link)): - return "[{}]({{filename}}{}.md)".format(text, link) - else: - return "[{}]({})".format(text, link) - - -def convert_image(m, slug): - text = m.group(1).split(",")[0].strip() - if "://" in text: - return "<img src=\"{}\">".format(text) - else: - return "![{}]({{attach}}{}/{})".format(text, slug, quote(text, "")) - - -def WikiToMD(content, slug): - - code_block = False - in_list = False - in_table = False - nested_level = 0 - prev_indent = 0 - old_content = content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines() - new_content = [] - - while old_content: - line = old_content.pop(0).rstrip() - tail = ["\n"] - while "{{{" in line or "}}}" in line: - if "{{{" in line: - code_block = True - line = line.replace("{{{", "```") - if "}}}" in line: - code_block = False - line = line.replace("}}}", "```") - if not code_block: +class Trac2Markdown: + + 
content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I) + camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))") + + wikiheading_patterns = tuple( + (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level))) + for level in range(1, 7) + ) + + def convert_headers(self, line): + for level_count, header in self.wikiheading_patterns: + try: + level = header.search(line).group(1) + if level: + line = "{} {}".format('#' * level_count, level.rstrip("= \r\t")) + break # No need to check other heading levels + except: + pass # Try the next heading level + return line + + def convert_to_creole(self, m): + # Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format. + # Creole's is easier to parse and harder to confuse with partially converted Markdown. + + text = m.group(1).strip() + if " " in text: + return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1)) + elif ":" in text or self.camelcase_pattern.match(text): + return "[[{}]]".format(text) + else: + return m.group(0) + + # Probably most of the non-wiki scheme tests should become a table in an + # extended JSON config file which maps + # + # { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" } + + def convert_wikilinks(self, m): + scheme, link, text = [p.strip() if p else p for p in m.groups()] + if text is None: + text = link + if any(link.startswith(q) and link.endswith(q) for q in ('"', "'")): + link = link[1:-1] + if any(text.startswith(q) and text.endswith(q) for q in ('"', "'")): + text = text[1:-1] + if text == link and link.startswith("http") and "://" in link: + return "<{}>".format(link) + elif scheme == "attachment:": + return "[{}]({{attach}}{}/{})".format(text, self.slug, link) + elif scheme in ("source:", "browser:"): + return "[{}]({}/{})".format(text, self.source_url.rstrip("/"), link.lstrip("/")) + elif scheme == "wiki:" or (scheme is None and self.camelcase_pattern.match(link)): + return 
"[{}]({{filename}}{}.md)".format(text, link) + else: + return "[{}]({})".format(text, link) + + def convert_image(self, m): + text = m.group(1).split(",")[0].strip() + if "://" in text: + return "<img src=\"{}\">".format(text) + else: + return "![{}]({{attach}}{}/{})".format(text, self.slug, quote(text, "")) + + def __init__(self, source_url = "https://git.cryptech.is/"): + self.source_url = source_url + self.pattern_actions = ( # Convert CamelCase links to explicit links - line = camelcase_pattern.sub(r"[[\1]]", line) - - # Convert TracLinks to WikiCreole links to simplify remaining processing - line = traclink_pattern.sub(convert_traclink_to_creolelink, line) - - # Convert tables. References: - # https://github.github.com/gfm/#tables-extension- - # https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables - # Table start: line containing "||"; table end: blank line? - # - # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does. Guess we can - # add a dummy header if no better idea. Markdown requires delimiter line, which we add immediately after the - # header, both appear to be mandatory. Trac can have label cells anywhere, not just in header, might need to - # add "*" to those or just ignore the issue. Justification we can sort of figure out from the header, - # if the rows do anything different, ouch, because markdown specifies in delimiter line. - # - # Might do something clever with the "=" markers and alignment, start with just getting the basic table - # structure to something markdown will believe. - - if line.strip().startswith("||"): - line = line.replace("=|", "|").replace("|=", "|") - line = line.replace("||", "|") - if not in_table: - tail.append("|---" * (line.count("|") - 1) + "|\n") - in_table = True - elif in_table and not line.strip().startswith("||"): - new_content.append("\n") - in_table = False - - # - # Convert bullet lists. The start and end of a list needs an empty line. 
- # - nested_line = line.lstrip(' ') - if nested_line.startswith('- ') or nested_line.startswith('* '): - if not in_list: - new_content.append("\n") - nested_level = 0 - prev_indent = 0 - in_list = True - indent = len(line) - len(nested_line) - if indent > prev_indent: - nested_level += 1 - elif indent < prev_indent: - nested_level -= 1 - prev_indent = indent - line = ' ' * nested_level + nested_line - elif in_list: - new_content.append("\n") - in_list = False - nested_level = 0 - prev_indent = 0 + (self.camelcase_pattern, r"[[\1]]"), + + # Convert TracLinks to WikiCreole syntax to simplify remaining processing + (re.compile(r"(?<!\[)\[([^][]+)\]"), self.convert_to_creole), # Convert !x quoting - line = bangquote_pattern.sub(r"\1", line) + (re.compile(r"!((?:\w|[#])+)"), r"\1"), # Convert (limited subset of) spans - line = span_pattern.sub(r"\1", line) - - # Convert headers - line = convert_headers(line) + (re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]"), r"\1"), # Convert images - line = image_pattern.sub(lambda m: convert_image(m, slug), line) + (re.compile(r"\[\[Image\((.*)\)\]\]"), self.convert_image), # Delete Trac macros that have no useful counterpart - line = delete_pattern.sub("", line) + (re.compile(r"\[\[PageOutline\]\]", re.I), r""), # Convert wiki links - line = wikilink_pattern.sub(lambda m: convert_wikilinks(m, slug, "https://git.cryptech.is/"), line) + (re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]"), self.convert_wikilinks), # Convert striked through text - line = strikethrough_pattern.sub(r"<s>\1</s>", line) + (re.compile(r"~~([^~]+)~~"), r"<s>\1</s>"), - # Convert line breaks - # Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue? - line = linebreak_pattern.sub(" ", line) + # Convert line breaks -- Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue? 
+ (re.compile(r"\\\\$"), r" "), # Convert bold and italic text (do this last) - line = line.replace("'''", "**") # Convert bold text - line = line.replace("''", "*") # Convert italic text + (re.compile(r"'''"), r"**"), + (re.compile(r"''"), r"*"), + ) + + def __call__(self, content, slug): + self.slug = slug + + old_content = self.content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines() + new_content = [] + + code_block = False + in_list = False + in_table = False + nested_level = 0 + prev_indent = 0 + + while old_content: + line = old_content.pop(0).rstrip() + tail = ["\n"] + while "{{{" in line or "}}}" in line: + if "{{{" in line: + code_block = True + line = line.replace("{{{", "```") + if "}}}" in line: + code_block = False + line = line.replace("}}}", "```") + if not code_block: + + # Convert tables. References: + # https://github.github.com/gfm/#tables-extension- + # https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables + # Table start: line containing "||"; table end: blank line? + # + # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does. Guess we can + # add a dummy header if no better idea. Markdown requires delimiter line, which we add immediately after the + # header, both appear to be mandatory. Trac can have label cells anywhere, not just in header, might need to + # add "*" to those or just ignore the issue. Justification we can sort of figure out from the header, + # if the rows do anything different, ouch, because markdown specifies in delimiter line. + # + # Might do something clever with the "=" markers and alignment, start with just getting the basic table + # structure to something markdown will believe. 
+ + if line.strip().startswith("||"): + line = line.replace("=|", "|").replace("|=", "|") + line = line.replace("||", "|") + if not in_table: + tail.append("|---" * (line.count("|") - 1) + "|\n") + in_table = True + elif in_table and not line.strip().startswith("||"): + new_content.append("\n") + in_table = False + + # + # Convert bullet lists. The start and end of a list needs an empty line. + # + nested_line = line.lstrip(' ') + if nested_line.startswith('- ') or nested_line.startswith('* '): + if not in_list: + new_content.append("\n") + nested_level = 0 + prev_indent = 0 + in_list = True + indent = len(line) - len(nested_line) + if indent > prev_indent: + nested_level += 1 + elif indent < prev_indent: + nested_level -= 1 + prev_indent = indent + line = ' ' * nested_level + nested_line + elif in_list: + new_content.append("\n") + in_list = False + nested_level = 0 + prev_indent = 0 + + # Convert headers + line = self.convert_headers(line) + + # Rest is regexp-driven conversions + for pattern, action in self.pattern_actions: + line = pattern.sub(action, line) + + new_content.append(line) + new_content.extend(tail) - new_content.append(line) - new_content.extend(tail) + del self.slug - return "".join(new_content) + return "".join(new_content) |