trac2md.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
pre { line-height: 125%; }
td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
# This started out as https://www.snip2code.com/Snippet/1704331/Convert-trac-markup-to-Markdown/
# which in turn said "This code mostly taken from patches to pagure_importer by mreynolds".
# Has mutated considerably since then.

import re
from urllib.parse import quote

class Trac2Markdown:
    """Line-oriented converter from Trac wiki markup to Markdown.

    An instance is callable: ``converter(content, slug)`` returns the
    converted text.  ``slug`` is interpolated into the ``{attach}`` links
    generated for attachments and images; ``source_url`` (constructor
    argument) is the base URL used for ``source:`` and ``browser:`` links.
    """

    # Explicit Trac line breaks: the [[br]] macro (any case) or a double backslash.
    content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)

    # A CamelCase word standing alone between whitespace / line boundaries.
    camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")

    # (level, pattern) pairs for "= Heading =" through "====== Heading ======".
    wikiheading_patterns = tuple(
        (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
        for level in range(1, 7)
    )

    def convert_headers(self, line):
        """Return ``line`` rewritten as a Markdown heading if it is a Trac heading.

        The first heading level whose pattern matches with a non-empty title
        wins; lines that match no level are returned unchanged.
        """
        for level_count, header in self.wikiheading_patterns:
            match = header.search(line)
            if match is None:
                continue            # Not this level, try the next heading level
            title = match.group(1)
            if title:
                # The greedy title group also swallows the closing "=" run, so trim it here.
                line = "{} {}".format("#" * level_count, title.rstrip("= \r\t"))
                break               # No need to check other heading levels
        return line

    def convert_to_creole(self, m):
        """Regex callback: rewrite a single-bracket Trac link as a Creole ``[[...]]`` link.

        Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format.
        Creole's is easier to parse and harder to confuse with partially converted Markdown.

        ``m.group(1)`` is the text between the brackets.  Bracketed text that
        contains neither a space nor a colon and is not CamelCase is returned
        untouched.
        """
        text = m.group(1).strip()
        if " " in text:
            # "target label words" -> "[[target|label words]]"
            target, label = text.split(" ", 1)
            return "[[{}|{}]]".format(target, label)
        if ":" in text or self.camelcase_pattern.match(text):
            return "[[{}]]".format(text)
        return m.group(0)

    @staticmethod
    def _strip_quotes(value):
        """Return ``value`` without one pair of matching surrounding quotes, if it has one."""
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            return value[1:-1]
        return value

    # Probably most of the non-wiki scheme tests should become a table in an
    # extended JSON config file which maps
    #
    #   { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }

    def convert_wikilinks(self, m):
        """Regex callback: turn a Creole ``[[scheme:link|text]]`` link into Markdown.

        The match groups are (scheme, link, text); scheme and text are
        optional.  Reads ``self.slug`` (for attachments) and
        ``self.source_url`` (for ``source:`` / ``browser:`` links).
        """
        scheme, link, text = [p.strip() if p else p for p in m.groups()]
        if text is None:
            text = link
        link = self._strip_quotes(link)
        text = self._strip_quotes(text)
        if text == link and link.startswith("http") and "://" in link:
            # Bare URL with no separate label: use an autolink.
            return "<{}>".format(link)
        if scheme == "attachment:":
            return "[{}]({{attach}}{}/{})".format(text, self.slug, link)
        if scheme in ("source:", "browser:"):
            return "[{}]({}/{})".format(text, self.source_url.rstrip("/"), link.lstrip("/"))
        if scheme == "wiki:" or (scheme is None and self.camelcase_pattern.match(link)):
            return "[{}]({{filename}}{}.md)".format(text, link)
        return "[{}]({})".format(text, link)

    def convert_image(self, m):
        """Regex callback: convert the arguments of an ``[[Image(...)]]`` macro.

        Only the first comma-separated argument is used.  Values containing
        ``://`` become an ``<img>`` tag; anything else becomes a Markdown image
        pointing at ``{attach}<slug>/<url-quoted name>``.
        """
        text = m.group(1).split(",")[0].strip()
        if "://" in text:
            return "<img src=\"{}\">".format(text)
        return "![{}]({{attach}}{}/{})".format(text, self.slug, quote(text, ""))

    def __init__(self, source_url = "https://git.cryptech.is/"):
        """Build the ordered table of (pattern, replacement) pairs applied to each line.

        ``source_url`` is the base URL prepended to ``source:`` and
        ``browser:`` link targets.
        """
        self.source_url = source_url
        self.pattern_actions = (

            # Convert TracLinks to WikiCreole syntax to simplify remaining processing
            (re.compile(r"(?<!\[)\[([^][]+)\]"),                                        self.convert_to_creole),

            # Convert CamelCase links to explicit links
            (self.camelcase_pattern,                                                    r"[[\1]]"),

            # Convert !x quoting
            (re.compile(r"!((?:\w|[#])+)"),                                             r"\1"),

            # Convert (limited subset of) spans
            (re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]"),                      r"\1"),

            # Convert images
            (re.compile(r"\[\[Image\((.*)\)\]\]"),                                      self.convert_image),

            # Delete Trac macros that have no useful counterpart
            (re.compile(r"\[\[PageOutline\]\]", re.I),                                  r""),

            # Convert wiki links
            (re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]"),     self.convert_wikilinks),

            # Convert striked through text
            (re.compile(r"~~([^~]+)~~"),                                                r"<s>\1</s>"),

            # Convert line breaks -- Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
            (re.compile(r"\\\\$"),                                                      r"  "),

            # Convert bold and italic text (do this last)
            (re.compile(r"'''"),                                                        r"**"),
            (re.compile(r"''"),                                                         r"*"),
        )

    def __call__(self, content, slug):
        """Convert the Trac wiki text ``content`` and return the Markdown text.

        ``slug`` is stored on the instance only for the duration of the call
        (the link callbacks read it) and is removed again even if the
        conversion raises.
        """
        self.slug = slug
        try:
            return self._convert_content(content)
        finally:
            del self.slug

    def _convert_content(self, content):
        """Run the line-by-line conversion; expects ``self.slug`` to be set by the caller."""

        # Put a newline after every explicit line break so that each one sits
        # at the end of its own line, where the "\\$" rule can see it.
        old_content = self.content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
        new_content = []

        code_block = False
        in_list = False
        in_table = False
        # Indents of the enclosing list items; nesting depth is len(indent_stack) - 1.
        # Seeded with 0 so that a first item indented past column 0 counts as one level deep.
        indent_stack = [0]
        # Column where the text of the most recent list item starts.
        text_indent = 0

        for raw_line in old_content:
            line = raw_line.rstrip()
            tail = ["\n"]

            # str.replace() handles every occurrence, so one pass per marker is enough.
            # A line holding both markers opens and closes the block on the same line.
            if "{{{" in line:
                code_block = True
                line = line.replace("{{{", "```")
            if "}}}" in line:
                code_block = False
                line = line.replace("}}}", "```")

            if not code_block:

                # Convert tables.  References:
                #   https://github.github.com/gfm/#tables-extension-
                #   https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
                # Table start: line containing "||"; table end: blank line?
                #
                # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does.  Guess we can
                # add a dummy header if no better idea.  Markdown requires delimiter line, which we add immediately after the
                # header, both appear to be mandatory.  Trac can have label cells anywhere, not just in header, might need to
                # add "*" to those or just ignore the issue.  Justification we can sort of figure out from the header,
                # if the rows do anything different, ouch, because markdown specifies in delimiter line.
                #
                # Might do something clever with the "=" markers and alignment, start with just getting the basic table
                # structure to something markdown will believe.

                if line.strip().startswith("||"):
                    line = line.replace("=|", "|").replace("|=", "|")
                    line = line.replace("||", "|")
                    if not in_table:
                        # First row of a table: emit the delimiter row right after it.
                        tail.append("|---" * (line.count("|") - 1) + "|\n")
                    in_table = True
                elif in_table:
                    new_content.append("\n")
                    in_table = False

                #
                # Convert bullet lists.  The start and end of a list needs an empty line.
                #
                nested_line = line.lstrip(' ')
                if nested_line.startswith(('- ', '* ')):
                    if not in_list:
                        new_content.append("\n")
                        indent_stack = [0]
                        in_list = True
                    indent = len(line) - len(nested_line)
                    text_indent = len(line) - len(nested_line[1:].lstrip())
                    if indent > indent_stack[-1]:
                        indent_stack.append(indent)
                    else:
                        # Close every level deeper than this item, so that a dedent of
                        # several levels at once lands on the right depth and the depth
                        # can never go negative.
                        while len(indent_stack) > 1 and indent < indent_stack[-1]:
                            indent_stack.pop()
                        indent_stack[-1] = indent
                    line = '    ' * (len(indent_stack) - 1) + nested_line
                elif in_list and len(line) - len(nested_line) < text_indent:
                    # Indented less than the previous item's text: not a continuation, the list is over.
                    new_content.append("\n")
                    in_list = False
                    indent_stack = [0]
                    text_indent = 0

                # Convert headers
                line = self.convert_headers(line)

                # Rest is regexp-driven conversions
                for pattern, action in self.pattern_actions:
                    line = pattern.sub(action, line)

            new_content.append(line)
            new_content.extend(tail)

        return "".join(new_content)