From 18fb1695b84248fc75ceb3569ff03cbeca51a620 Mon Sep 17 00:00:00 2001
From: Rob Austein <sra@hactrn.net>
Date: Mon, 15 Feb 2021 22:29:56 +0000
Subject: Seriously rework link processing

---
 trac2md.py | 167 +++++++++++++++++++++++++++++--------------------------------
 1 file changed, 78 insertions(+), 89 deletions(-)

(limited to 'trac2md.py')

diff --git a/trac2md.py b/trac2md.py
index a8a632b..7e17c5c 100755
--- a/trac2md.py
+++ b/trac2md.py
@@ -14,16 +14,23 @@ from base64 import b64decode
 from datetime import datetime
 from urllib.parse import quote
 
+content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)  # "[[br]]" in any case, or a "\\" pair
+
+traclink_pattern = re.compile(r"(?<!\[)\[([^][]+)\]")  # "[...]" with no nested brackets, not directly preceded by "["
+
 image_pattern = re.compile(r"\[\[Image\((.*)\)\]\]")
 
-wikilink_1_pattern = re.compile(r"\[\[(http.*)\]\]|\[(http.*)\]")
-wikilink_2_pattern = re.compile(r"\[\[attachment:([a-zA-Z0-9_/]+)\]\]|\[attachment:([^][]+)\]")
-wikilink_3_pattern = re.compile(r"\[\[(?:wiki:)?([^][]+)\]\]|\[wiki:([^][]+)\]")
+wikilink_pattern = re.compile(r"\[\[(wiki:|attachment:)?([^]|[]+)(?:[|]([^][]+))?\]\]")  # [[scheme:target|label]], scheme and label optional
+
+strikethrough_pattern = re.compile(r"~~([^~]+)~~")  # ~~text~~ where text contains no "~"
+bangquote_pattern = re.compile(r"!((?:\w|[#])+)")  # "!" followed by a run of word characters and/or "#"
+linebreak_pattern = re.compile(r"\\\\$")  # trailing "\\" pair
+
+camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")  # CamelCase word bounded by whitespace or the string edges
 
-strikethrough_pattern = re.compile(r"~~(.*)~~")
-camelcase_pattern = re.compile(r"!((?:\w|[#])+)")
+span_pattern = re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]")  # [[span(...)]]; captures the last comma-separated argument
 
-span_pattern = re.compile(r"\[\[span\((?:[^][]*,)*\[([^(), ]+)([^(),]+)\]\)\]\]")
+delete_pattern = re.compile(r"\[\[PageOutline\]\]", re.I)  # [[PageOutline]] macro, matched in any case
 
 wikiheading_patterns = tuple(
     (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
@@ -36,57 +43,50 @@ def convert_headers(line):
             level = header.search(line).group(1)
             if level:
                 line = "%s %s" % ('#' * level_count, level.rstrip("= \r\t"))
-                break  # No need to check other heading levels
+                break          # No need to check other heading levels
         except:
-            # Try the next heading level
-            pass
-
+            pass                # Try the next heading level
     return line
 
 
-def make_mdlink(text, slug, sep):
-    if sep in text:
-        parts = text.split(sep, 1)
-    else:
-        parts = [text]
-    parts = [p.strip() for p in parts]
-    for i, part in enumerate(parts):
-        if any(part.startswith(q) and part.endswith(q) for q in ('"', "'")):
-            parts[i] = part[1:-1]
-    if slug is None:
-        return "[{}]({})".format(parts[-1], parts[0])
-    else:
-        return "[{}]({{attach}}{}/{})".format(parts[-1], slug, parts[0])
-
-
-def convert_wikilinks(line, pattern, slug = None):
-    pos = 0
-    while True:
-        m = pattern.search(line, pos)
-        if not m:
-            break
-        text = m.group(1) or m.group(2)
-        if text.lower() == "pageoutline":
-            mdlink = ""
-        else:
-            mdlink = make_mdlink(text, slug, "|" if m.group(0).startswith("[[") else " ")
-        line = line.replace(m.group(0), mdlink)
-        pos = m.start() + len(mdlink)
+def convert_traclink_to_creolelink(line):
+    # Convert Trac's native [target label] links to Creole's [[target|label]], so that rest of the code only has to
+    # deal with one format.  Creole's is easier to parse and harder to confuse with partially converted Markdown.
+    def to_creole(m):
+        text = m.group(1).strip()
+        if " " in text:
+            return "[[{0[0]}|{0[1]}]]".format(text.split(" ", 1))
+        is_link = text.startswith(("wiki:", "attachment:")) or camelcase_pattern.match(text)
+        return "[[{}]]".format(text) if is_link else m.group(0)     # Leave anything unrecognized untouched
+    line = traclink_pattern.sub(to_creole, line)    # Per-match, so text inside existing [[...]] links is not rewritten
     return line
 
 
-def convert_strike(line):
-    striked_result = strikethrough_pattern.search(line)
-    if striked_result:
-        try:
-            striked_text = striked_result.group(1)
-            if striked_text:
-                orig_text = '~~%s~~' % striked_text
-                new_text = '<s>%s</s>' % striked_text
-                line = line.replace(orig_text, new_text)
-        except:
-            # Not striked
-            pass
+def convert_wikilinks(line, slug):
+    # Convert every Creole link [[scheme:target|label]] in line to a Markdown link [label](target).
+    #
+    #   attachment:  links become [label]({attach}<slug>/<target>)
+    #   wiki:        links, and scheme-less CamelCase targets, become [label](<target>.md)
+    #   anything else becomes [label](<target>) with the target passed through unchanged
+    #
+    # A missing label defaults to the target; matching quotes around either one are dropped.
+
+    def unquote(s):
+        return s[1:-1] if any(s.startswith(q) and s.endswith(q) for q in ('"', "'")) else s
+
+    def to_markdown(m):
+        scheme, link, text = [p.strip() if p else p for p in m.groups()]
+        if text is None:
+            text = link
+        link, text = unquote(link), unquote(text)
+        if scheme == "attachment:":
+            return "[{}]({{attach}}{}/{})".format(text, slug, link)
+        if scheme == "wiki:" or (scheme is None and camelcase_pattern.match(link)):
+            return "[{}]({}.md)".format(text, link)
+        return "[{}]({})".format(text, link)
+
+    # Substitute per match instead of str.replace() in a finditer() loop: each link is rewritten exactly once.
+    line = wikilink_pattern.sub(to_markdown, line)
     return line
 
 
@@ -106,24 +106,14 @@ def convert_image(line, slug):
     return line
 
 
-def convert_linebreak(line):
-    # Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
-    if line.endswith("\\\\"):
-        line = line[:-2] + "  "
-    return line
-
-
 def WikiToMD(content, slug):
 
-    # Line breaks in Markdown must be at end of line, so add newlines as needed
-    content = content.replace("[[br]]", "\\\\").replace("[[BR]]", "\\\\").replace("\\\\", "\\\\\n")
-
     code_block = False
     in_list = False
     in_table = False
     nested_level = 0
     prev_indent = 0
-    old_content = content.splitlines()
+    old_content = content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
     new_content = []
 
     while old_content:
@@ -137,29 +127,27 @@ def WikiToMD(content, slug):
                 code_block = False
                 line = line.replace("}}}", "```")
         if not code_block:
-            #
-            # Want to convert tables.  References:
+
+            # Convert CamelCase links to explicit links
+            line = camelcase_pattern.sub(r"[[\1]]", line)
+
+            # Convert TracLinks to WikiCreole links to simplify remaining processing
+            line = convert_traclink_to_creolelink(line)
+
+            # Convert tables.  References:
             #   https://github.github.com/gfm/#tables-extension-
             #   https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
+            # Table start: line containing "||"; table end: blank line?
             #
-            # Table start: line containing "||"
-            # Table end: blank line?
-            #
-            # Figuring out whether there's a real header line is fun,
-            # trac doesn't require one, markdown does.  Guess we can
-            # add a dummy header if no better idea.  Markdown requires
-            # delimiter line, which we add immediately after the
-            # header, both appear to be mandatory.  Trac can have
-            # label cells anywhere, not just in header, might need to
-            # add "*" to those or just ignore the issue.
-            # Justification we can sort of figure out from the header,
-            # if the rows do anything different, ouch, because
-            # markdown specifies in delimiter line.
+            # Figuring out whether there's a real header line is fun, trac doesn't require one, markdown does.  Guess we can
+            # add a dummy header if no better idea.  Markdown requires delimiter line, which we add immediately after the
+            # header, both appear to be mandatory.  Trac can have label cells anywhere, not just in header, might need to
+            # add "*" to those or just ignore the issue.  Justification we can sort of figure out from the header,
+            # if the rows do anything different, ouch, because markdown specifies in delimiter line.
             #
-            # Might do something clever with the "=" markers and
-            # alignment, start with just getting the basic table
+            # Might do something clever with the "=" markers and alignment, start with just getting the basic table
             # structure to something markdown will believe.
-            #
+
             if line.strip().startswith("||"):
                 line = line.replace("=|", "|").replace("|=", "|")
                 line = line.replace("||", "|")
@@ -171,8 +159,7 @@ def WikiToMD(content, slug):
                 in_table = False
 
             #
-            # Convert bullet lists.  The start and end of a list needs
-            # an empty line.
+            # Convert bullet lists.  The start and end of a list each need an empty line.
             #
             nested_line = line.lstrip(' ')
             if nested_line.startswith('- ') or nested_line.startswith('* '):
@@ -194,11 +181,11 @@ def WikiToMD(content, slug):
                 nested_level = 0
                 prev_indent = 0
 
-            # Convert CamelCase
-            line = camelcase_pattern.sub(r"\1", line)
+            # Convert !x quoting
+            line = bangquote_pattern.sub(r"\1", line)
 
             # Convert (limited subset of) spans
-            line = span_pattern.sub(r"[[\1|\2]]", line)
+            line = span_pattern.sub(r"\1", line)
 
             # Convert headers
             line = convert_headers(line)
@@ -206,16 +193,18 @@ def WikiToMD(content, slug):
             # Convert images
             line = convert_image(line, slug)
 
+            # Delete Trac macros that have no useful counterpart
+            line = delete_pattern.sub("", line)
+
             # Convert wiki links
-            line = convert_wikilinks(line, wikilink_1_pattern)
-            line = convert_wikilinks(line, wikilink_2_pattern, slug)
-            line = convert_wikilinks(line, wikilink_3_pattern)
+            line = convert_wikilinks(line, slug)
 
             # Convert striked through text
-            line = convert_strike(line)
+            line = strikethrough_pattern.sub(r"<s>\1</s>", line)
 
             # Convert line breaks
-            line = convert_linebreak(line)
+            # Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
+            line = linebreak_pattern.sub("  ", line)
 
             # Convert bold and italic text (do this last)
             line = line.replace("'''", "**")  # Convert bold text
-- 
cgit v1.2.3