summary refs log tree commit diff
diff options
context:
space:
mode:
author Rob Austein <sra@hactrn.net> 2021-02-16 06:46:38 +0000
committer Rob Austein <sra@hactrn.net> 2021-02-16 06:46:38 +0000
commit a4a027a6700e1197a50b24bdfbde1697049b7348 (patch)
tree 6d0c71cdb020bb7b6725414ed5ca50a97d5f20e2
parent 61a631cbcf8c584dd8adde13d26d005eb4675935 (diff)
Refactor Wiki2Markdown as class
Makes portions of the mess clearer, not as much as I'd hoped. With a bit more work we could fold header-crunching into the existing regexp mechanism, but code blocks, links, and tables require state.
-rwxr-xr-x extract.py 4
-rwxr-xr-x trac2md.py 335
2 files changed, 168 insertions, 171 deletions
diff --git a/extract.py b/extract.py
index c7e35b9..793b502 100755
--- a/extract.py
+++ b/extract.py
@@ -92,6 +92,8 @@ def main():
os.link("pelicanconf.py", "pelican/pelicanconf.py")
+ wiki_to_markdown = trac2md.Trac2Markdown()
+
keep = Filter()
first_published = {}
@@ -105,7 +107,7 @@ def main():
#print(slug, row.version)
with open("wiki/{}.trac".format(slug), "w") as f:
f.write(row.text)
- md = markdown_header(row, first_published) + trac2md.WikiToMD(row.text, slug)
+ md = markdown_header(row, first_published) + wiki_to_markdown(row.text, slug)
with open("pelican/content/{}.md".format(slug), "w") as f:
f.write(md)
diff --git a/trac2md.py b/trac2md.py
index e16845b..c7cf85e 100755
--- a/trac2md.py
+++ b/trac2md.py
@@ -5,192 +5,187 @@
import re
from urllib.parse import quote
-content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)
-
-traclink_pattern = re.compile(r"(?<!\[)\[([^][]+)\]")
-
-image_pattern = re.compile(r"\[\[Image\((.*)\)\]\]")
-
-wikilink_pattern = re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]")
-
-strikethrough_pattern = re.compile(r"~~([^~]+)~~")
-bangquote_pattern = re.compile(r"!((?:\w|[#])+)")
-linebreak_pattern = re.compile(r"\\\\$")
-
-camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")
-
-span_pattern = re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]")
-
-delete_pattern = re.compile(r"\[\[PageOutline\]\]", re.I)
-
-wikiheading_patterns = tuple(
- (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
- for level in range(1, 7))
-
-
-def convert_headers(line):
- for level_count, header in wikiheading_patterns:
- try:
- level = header.search(line).group(1)
- if level:
- line = "{} {}".format('#' * level_count, level.rstrip("= \r\t"))
- break # No need to check other heading levels
- except:
- pass # Try the next heading level
- return line
-
-
-def convert_traclink_to_creolelink(m):
- # Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format.
- # Creole's is easier to parse and harder to confuse with partially converted Markdown.
-
- text = m.group(1).strip()
- if " " in text:
- return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1))
- elif ":" in text or camelcase_pattern.match(text):
- return "[[{}]]".format(text)
- else:
- return m.group(0)
-
-
-# Probably most of the non-wiki scheme tests should become a table in an
-# extended JSON config file which maps
-#
-# { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }
-
-def convert_wikilinks(m, slug, giturl):
- scheme, link, text = [p.strip() if p else p for p in m.groups()]
- if text is None:
- text = link
- if any(link.startswith(q) and link.endswith(q) for q in ('"', "'")):
- link = link[1:-1]
- if any(text.startswith(q) and text.endswith(q) for q in ('"', "'")):
- text = text[1:-1]
- if text == link and link.startswith("http") and "://" in link:
- return "<{}>".format(link)
- elif scheme == "attachment:":
- return "[{}]({{attach}}{}/{})".format(text, slug, link)
- elif scheme in ("source:", "browser:"):
- return "[{}]({}/{})".format(text, giturl.rstrip("/"), link.lstrip("/"))
- elif scheme == "wiki:" or (scheme is None and camelcase_pattern.match(link)):
- return "[{}]({{filename}}{}.md)".format(text, link)
- else:
- return "[{}]({})".format(text, link)
-
-
-def convert_image(m, slug):
- text = m.group(1).split(",")[0].strip()
- if "://" in text:
- return "<img src=\"{}\">".format(text)
- else:
- return "![{}]({{attach}}{}/{})".format(text, slug, quote(text, ""))
-
-
-def WikiToMD(content, slug):
-
- code_block = False
- in_list = False
- in_table = False
- nested_level = 0
- prev_indent = 0
- old_content = content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
- new_content = []
-
- while old_content:
- line = old_content.pop(0).rstrip()
- tail = ["\n"]
- while "{{{" in line or "}}}" in line:
- if "{{{" in line:
- code_block = True
- line = line.replace("{{{", "```")
- if "}}}" in line:
- code_block = False
- line = line.replace("}}}", "```")
- if not code_block:
+class Trac2Markdown:
+
+ content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)
+ camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")
+
+ wikiheading_patterns = tuple(
+ (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
+ for level in range(1, 7)
+ )
+
+ def convert_headers(self, line):
+ for level_count, header in self.wikiheading_patterns:
+ try:
+ level = header.search(line).group(1)
+ if level:
+ line = "{} {}".format('#' * level_count, level.rstrip("= \r\t"))
+ break # No need to check other heading levels
+ except:
+ pass # Try the next heading level
+ return line
+
+ def convert_to_creole(self, m):
+ # Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format.
+ # Creole's is easier to parse and harder to confuse with partially converted Markdown.
+
+ text = m.group(1).strip()
+ if " " in text:
+ return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1))
+ elif ":" in text or self.camelcase_pattern.match(text):
+ return "[[{}]]".format(text)
+ else:
+ return m.group(0)
+
+ # Probably most of the non-wiki scheme tests should become a table in an
+ # extended JSON config file which maps
+ #
+ # { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }
+
+ def convert_wikilinks(self, m):
+ scheme, link, text = [p.strip() if p else p for p in m.groups()]
+ if text is None:
+ text = link
+ if any(link.startswith(q) and link.endswith(q) for q in ('"', "'")):
+ link = link[1:-1]
+ if any(text.startswith(q) and text.endswith(q) for q in ('"', "'")):
+ text = text[1:-1]
+ if text == link and link.startswith("http") and "://" in link:
+ return "<{}>".format(link)
+ elif scheme == "attachment:":
+ return "[{}]({{attach}}{}/{})".format(text, self.slug, link)
+ elif scheme in ("source:", "browser:"):
+ return "[{}]({}/{})".format(text, self.source_url.rstrip("/"), link.lstrip("/"))
+ elif scheme == "wiki:" or (scheme is None and self.camelcase_pattern.match(link)):
+ return "[{}]({{filename}}{}.md)".format(text, link)
+ else:
+ return "[{}]({})".format(text, link)
+
+ def convert_image(self, m):
+ text = m.group(1).split(",")[0].strip()
+ if "://" in text:
+ return "<img src=\"{}\">".format(text)
+ else:
+ return "![{}]({{attach}}{}/{})".format(text, self.slug, quote(text, ""))
+
+ def __init__(self, source_url = "https://git.cryptech.is/"):
+ self.source_url = source_url
+ self.pattern_actions = (
# Convert CamelCase links to explicit links
- line = camelcase_pattern.sub(r"[[\1]]", line)
-
- # Convert TracLinks to WikiCreole links to simplify remaining processing
- line = traclink_pattern.sub(convert_traclink_to_creolelink, line)
-
- # Convert tables. References:
- # https://github.github.com/gfm/#tables-extension-
- # https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
- # Table start: line containing "||"; table end: blank line?
- #
- # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does. Guess we can
- # add a dummy header if no better idea. Markdown requires delimiter line, which we add immediately after the
- # header, both appear to be mandatory. Trac can have label cells anywhere, not just in header, might need to
- # add "*" to those or just ignore the issue. Justification we can sort of figure out from the header,
- # if the rows do anything different, ouch, because markdown specifies in delimiter line.
- #
- # Might do something clever with the "=" markers and alignment, start with just getting the basic table
- # structure to something markdown will believe.
-
- if line.strip().startswith("||"):
- line = line.replace("=|", "|").replace("|=", "|")
- line = line.replace("||", "|")
- if not in_table:
- tail.append("|---" * (line.count("|") - 1) + "|\n")
- in_table = True
- elif in_table and not line.strip().startswith("||"):
- new_content.append("\n")
- in_table = False
-
- #
- # Convert bullet lists. The start and end of a list needs an empty line.
- #
- nested_line = line.lstrip(' ')
- if nested_line.startswith('- ') or nested_line.startswith('* '):
- if not in_list:
- new_content.append("\n")
- nested_level = 0
- prev_indent = 0
- in_list = True
- indent = len(line) - len(nested_line)
- if indent > prev_indent:
- nested_level += 1
- elif indent < prev_indent:
- nested_level -= 1
- prev_indent = indent
- line = ' ' * nested_level + nested_line
- elif in_list:
- new_content.append("\n")
- in_list = False
- nested_level = 0
- prev_indent = 0
+ (self.camelcase_pattern, r"[[\1]]"),
+
+ # Convert TracLinks to WikiCreole syntax to simplify remaining processing
+ (re.compile(r"(?<!\[)\[([^][]+)\]"), self.convert_to_creole),
# Convert !x quoting
- line = bangquote_pattern.sub(r"\1", line)
+ (re.compile(r"!((?:\w|[#])+)"), r"\1"),
# Convert (limited subset of) spans
- line = span_pattern.sub(r"\1", line)
-
- # Convert headers
- line = convert_headers(line)
+ (re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]"), r"\1"),
# Convert images
- line = image_pattern.sub(lambda m: convert_image(m, slug), line)
+ (re.compile(r"\[\[Image\((.*)\)\]\]"), self.convert_image),
# Delete Trac macros that have no useful counterpart
- line = delete_pattern.sub("", line)
+ (re.compile(r"\[\[PageOutline\]\]", re.I), r""),
# Convert wiki links
- line = wikilink_pattern.sub(lambda m: convert_wikilinks(m, slug, "https://git.cryptech.is/"), line)
+ (re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]"), self.convert_wikilinks),
# Convert striked through text
- line = strikethrough_pattern.sub(r"<s>\1</s>", line)
+ (re.compile(r"~~([^~]+)~~"), r"<s>\1</s>"),
- # Convert line breaks
- # Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
- line = linebreak_pattern.sub(" ", line)
+ # Convert line breaks -- Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
+ (re.compile(r"\\\\$"), r" "),
# Convert bold and italic text (do this last)
- line = line.replace("'''", "**") # Convert bold text
- line = line.replace("''", "*") # Convert italic text
+ (re.compile(r"'''"), r"**"),
+ (re.compile(r"''"), r"*"),
+ )
+
+ def __call__(self, content, slug):
+ self.slug = slug
+
+ old_content = self.content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
+ new_content = []
+
+ code_block = False
+ in_list = False
+ in_table = False
+ nested_level = 0
+ prev_indent = 0
+
+ while old_content:
+ line = old_content.pop(0).rstrip()
+ tail = ["\n"]
+ while "{{{" in line or "}}}" in line:
+ if "{{{" in line:
+ code_block = True
+ line = line.replace("{{{", "```")
+ if "}}}" in line:
+ code_block = False
+ line = line.replace("}}}", "```")
+ if not code_block:
+
+ # Convert tables. References:
+ # https://github.github.com/gfm/#tables-extension-
+ # https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
+ # Table start: line containing "||"; table end: blank line?
+ #
+ # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does. Guess we can
+ # add a dummy header if no better idea. Markdown requires delimiter line, which we add immediately after the
+ # header, both appear to be mandatory. Trac can have label cells anywhere, not just in header, might need to
+ # add "*" to those or just ignore the issue. Justification we can sort of figure out from the header,
+ # if the rows do anything different, ouch, because markdown specifies in delimiter line.
+ #
+ # Might do something clever with the "=" markers and alignment, start with just getting the basic table
+ # structure to something markdown will believe.
+
+ if line.strip().startswith("||"):
+ line = line.replace("=|", "|").replace("|=", "|")
+ line = line.replace("||", "|")
+ if not in_table:
+ tail.append("|---" * (line.count("|") - 1) + "|\n")
+ in_table = True
+ elif in_table and not line.strip().startswith("||"):
+ new_content.append("\n")
+ in_table = False
+
+ #
+ # Convert bullet lists. The start and end of a list needs an empty line.
+ #
+ nested_line = line.lstrip(' ')
+ if nested_line.startswith('- ') or nested_line.startswith('* '):
+ if not in_list:
+ new_content.append("\n")
+ nested_level = 0
+ prev_indent = 0
+ in_list = True
+ indent = len(line) - len(nested_line)
+ if indent > prev_indent:
+ nested_level += 1
+ elif indent < prev_indent:
+ nested_level -= 1
+ prev_indent = indent
+ line = ' ' * nested_level + nested_line
+ elif in_list:
+ new_content.append("\n")
+ in_list = False
+ nested_level = 0
+ prev_indent = 0
+
+ # Convert headers
+ line = self.convert_headers(line)
+
+ # Rest is regexp-driven conversions
+ for pattern, action in self.pattern_actions:
+ line = pattern.sub(action, line)
+
+ new_content.append(line)
+ new_content.extend(tail)
- new_content.append(line)
- new_content.extend(tail)
+ del self.slug
- return "".join(new_content)
+ return "".join(new_content)