# trac2md.py

# This started out as https://www.snip2code.com/Snippet/1704331/Convert-trac-markup-to-Markdown/
# which in turn said "This code mostly taken from patches to pagure_importer by mreynolds".
# Has mutated considerably since then.

import re
from urllib.parse import quote

class Trac2Markdown:
    """Line-oriented converter from Trac wiki markup to Markdown.

    An instance is configured once (with the base URL used for ``source:`` and
    ``browser:`` links) and then called like a function with the wiki text and
    the slug of the page being converted::

        markdown = Trac2Markdown()(trac_text, "PageSlug")

    The slug is interpolated into ``{attach}`` references produced for
    attachments and images.
    """

    content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)
    camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")

    wikiheading_patterns = tuple(
        (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
        for level in range(1, 7)
    )

    def convert_headers(self, line):
        """Turn a Trac heading (``== Title ==``) into a Markdown heading (``## Title``).

        Heading levels 1 through 6 are tried in order; the first level whose
        pattern matches with a non-empty title wins.  Lines that are not
        headings are returned unchanged.
        """
        for level_count, header in self.wikiheading_patterns:
            found = header.search(line)
            if found is None:
                continue            # Try the next heading level
            title = found.group(1)
            if title:
                # The greedy capture includes any closing "=" run, so trim it here.
                line = "{} {}".format('#' * level_count, title.rstrip("= \r\t"))
                break               # No need to check other heading levels
        return line

    def convert_to_creole(self, m):
        """Regex callback: rewrite a single-bracket Trac link as a Creole ``[[...]]`` link.

        Convert Trac's native link form to Creole's, so that rest of the code only has to deal with one format.
        Creole's is easier to parse and harder to confuse with partially converted Markdown.

        Bracketed text that contains neither a space nor a colon and is not a
        CamelCase word is not treated as a link and is returned untouched.
        """
        text = m.group(1).strip()
        if " " in text:
            # "[target label words]" -> "[[target|label words]]"
            return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1))
        elif ":" in text or self.camelcase_pattern.match(text):
            return "[[{}]]".format(text)
        else:
            return m.group(0)

    # Probably most of the non-wiki scheme tests should become a table in an
    # extended JSON config file which maps
    #
    #   { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }

    def convert_wikilinks(self, m):
        """Regex callback: render a Creole ``[[scheme:link|text]]`` link as Markdown.

        The match supplies three groups: an optional scheme (``wiki:``,
        ``attachment:``, ``source:`` or ``browser:``), the link target, and an
        optional label.  A missing label defaults to the target.  One pair of
        matching surrounding quotes is removed from both target and label.

        Reads ``self.slug`` (for attachments) and ``self.source_url`` (for
        ``source:`` / ``browser:`` links).
        """
        scheme, link, text = [p.strip() if p else p for p in m.groups()]
        if text is None:
            text = link
        if any(link.startswith(q) and link.endswith(q) for q in ('"', "'")):
            link = link[1:-1]
        if any(text.startswith(q) and text.endswith(q) for q in ('"', "'")):
            text = text[1:-1]
        if text == link and link.startswith("http") and "://" in link:
            # Bare URL with no distinct label: use Markdown autolink syntax.
            return "<{}>".format(link)
        elif scheme == "attachment:":
            return "[{}]({{attach}}{}/{})".format(text, self.slug, link)
        elif scheme in ("source:", "browser:"):
            return "[{}]({}/{})".format(text, self.source_url.rstrip("/"), link.lstrip("/"))
        elif scheme == "wiki:" or (scheme is None and self.camelcase_pattern.match(link)):
            return "[{}]({{filename}}{}.md)".format(text, link)
        else:
            return "[{}]({})".format(text, link)

    def convert_image(self, m):
        """Regex callback: render a Trac ``[[Image(...)]]`` macro.

        Only the first comma-separated macro argument (the image reference) is
        used; any further arguments are dropped.  References containing
        ``://`` become an HTML ``<img>`` tag, anything else becomes a Markdown
        image pointing at an ``{attach}`` path under ``self.slug`` with the
        file name percent-encoded.
        """
        text = m.group(1).split(",")[0].strip()
        if "://" in text:
            return "<img src=\"{}\">".format(text)
        else:
            return "![{}]({{attach}}{}/{})".format(text, self.slug, quote(text, ""))

    def __init__(self, source_url = "https://git.cryptech.is/"):
        """Build the ordered table of (pattern, replacement) rules.

        ``source_url`` is the base URL that ``source:`` and ``browser:`` links
        are resolved against.  The rules are applied to every non-code line in
        the order listed, so later rules see the output of earlier ones.
        """
        self.source_url = source_url
        self.pattern_actions = (

            # Convert CamelCase links to explicit links
            (self.camelcase_pattern,                                                    r"[[\1]]"),

            # Convert TracLinks to WikiCreole syntax to simplify remaining processing
            (re.compile(r"(?<!\[)\[([^][]+)\]"),                                        self.convert_to_creole),

            # Convert !x quoting
            (re.compile(r"!((?:\w|[#])+)"),                                             r"\1"),

            # Convert (limited subset of) spans
            (re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]"),                      r"\1"),

            # Convert images
            (re.compile(r"\[\[Image\((.*)\)\]\]"),                                      self.convert_image),

            # Delete Trac macros that have no useful counterpart
            (re.compile(r"\[\[PageOutline\]\]", re.I),                                  r""),

            # Convert wiki links
            (re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]"),     self.convert_wikilinks),

            # Convert striked through text
            (re.compile(r"~~([^~]+)~~"),                                                r"<s>\1</s>"),

            # Convert line breaks -- Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
            (re.compile(r"\\\\$"),                                                      r"  "),

            # Convert bold and italic text (do this last)
            (re.compile(r"'''"),                                                        r"**"),
            (re.compile(r"''"),                                                         r"*"),
        )

    def __call__(self, content, slug):
        """Convert Trac wiki text ``content`` to Markdown and return it as a string.

        ``slug`` identifies the page being converted and is used when building
        ``{attach}`` references.  It is held on the instance only for the
        duration of the call (the link and image callbacks read it) and is
        removed again afterwards, even when conversion raises.
        """
        self.slug = slug
        try:
            return self._convert(content)
        finally:
            del self.slug

    def _convert(self, content):
        """Run the line-by-line conversion; ``self.slug`` must already be set."""

        # Both "[[br]]" and a double backslash become a double backslash followed
        # by a real newline, so the end-of-line rule in pattern_actions can turn
        # them into Markdown hard breaks.
        source_lines = self.content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
        new_content = []

        code_block = False
        in_list = False
        in_table = False
        # Indents of the currently open list levels.  The permanent 0 at the
        # bottom is the baseline: an item indented further than the baseline is
        # emitted one level deep, an item at column 0 at level zero.
        indent_stack = [0]

        for raw_line in source_lines:
            line = raw_line.rstrip()
            tail = ["\n"]

            # Trac code blocks: "{{{" opens, "}}}" closes.  When a line carries
            # both, whichever marker comes last decides the state for the
            # following lines.
            opener = line.rfind("{{{")
            closer = line.rfind("}}}")
            if opener >= 0 or closer >= 0:
                code_block = opener > closer
                line = line.replace("{{{", "```").replace("}}}", "```")

            if not code_block:

                # Convert tables.  References:
                #   https://github.github.com/gfm/#tables-extension-
                #   https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
                # Table start: line containing "||"; table end: first line not starting with "||".
                #
                # Trac doesn't require a header row, Markdown does, and Markdown also requires a delimiter line
                # right after it.  The first row of every table is therefore treated as the header and a
                # "|---|...|" delimiter line is emitted after it.  Trac's "=" header/alignment markers are
                # simply dropped.

                if line.strip().startswith("||"):
                    line = line.replace("=|", "|").replace("|=", "|")
                    line = line.replace("||", "|")
                    if not in_table:
                        tail.append("|---" * (line.count("|") - 1) + "|\n")
                    in_table = True
                elif in_table:
                    new_content.append("\n")
                    in_table = False

                #
                # Convert bullet lists.  The start and end of a list needs an empty line.
                #
                item = line.lstrip(' ')
                if item.startswith(('- ', '* ')):
                    if not in_list:
                        new_content.append("\n")
                        indent_stack = [0]
                        in_list = True
                    indent = len(line) - len(item)
                    # Close every level deeper than this item, so that a dedent
                    # spanning several levels lands on the right one.
                    while indent < indent_stack[-1]:
                        indent_stack.pop()
                    if indent > indent_stack[-1]:
                        indent_stack.append(indent)
                    line = '    ' * (len(indent_stack) - 1) + item
                elif in_list:
                    new_content.append("\n")
                    in_list = False
                    indent_stack = [0]

                # Convert headers
                line = self.convert_headers(line)

                # Rest is regexp-driven conversions
                for pattern, action in self.pattern_actions:
                    line = pattern.sub(action, line)

            new_content.append(line)
            new_content.extend(tail)

        return "".join(new_content)