path: root/trac2md.py

                      

#!/usr/bin/env python3

# Untested code from https://www.snip2code.com/Snippet/1704331/Convert-trac-markup-to-Markdown/

# This code mostly taken from patches to pagure_importer by mreynolds

import sys
import re
import time
import requests
import shutil
import os
from base64 import b64decode
from datetime import datetime
from urllib.parse import quote

# Explicit line breaks: the [[br]] macro (any case) or a double backslash.
content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)

# Trac-native single-bracket link, "[target]" or "[target label]".  The
# lookbehind and the character class keep it from matching inside "[[...]]".
traclink_pattern = re.compile(r"(?<!\[)\[([^][]+)\]")

# [[Image(args)]] macro.  The argument list is matched non-greedily so that
# two macros on the same line are seen as two images rather than as a single
# match running from the first "[[Image(" to the last ")]]".
image_pattern = re.compile(r"\[\[Image\((.*?)\)\]\]")

# Creole-style link: [[scheme:target|label]], scheme and label both optional.
wikilink_pattern = re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]")

# ~~struck out~~ text, "!" quoting of a word, and a double backslash at end of line.
strikethrough_pattern = re.compile(r"~~([^~]+)~~")
bangquote_pattern = re.compile(r"!((?:\w|[#])+)")
linebreak_pattern = re.compile(r"\\\\$")

# A CamelCase word standing on its own (delimited by whitespace or line boundaries).
camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")

# [[span(attr, ..., text)]]: only the last comma-separated argument is captured.
span_pattern = re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]")

# Macros that are simply removed from the output.
delete_pattern = re.compile(r"\[\[PageOutline\]\]", re.I)

# (level, pattern) pairs for "= Heading =" through "====== Heading ======".
# The greedy (.*) also captures any trailing "=" markers; convert_headers()
# strips them afterwards.
wikiheading_patterns = tuple(
    (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
    for level in range(1, 7))


def convert_headers(line):
    """Convert a Trac ``= Heading =`` line into a Markdown ``# Heading`` line.

    Each entry of ``wikiheading_patterns`` is tried in order.  The first one
    that matches with a non-empty title wins: the title has trailing ``=``,
    spaces, tabs and carriage returns stripped and is prefixed with one ``#``
    per heading level.

    Args:
        line: A single line of wiki text.

    Returns:
        The Markdown heading, or ``line`` unchanged if it is not a heading.
    """
    for level_count, header in wikiheading_patterns:
        match = header.search(line)
        if match is None:
            continue            # Not a heading of this level, try the next one
        title = match.group(1)
        if title:
            # No need to check other heading levels
            return "{} {}".format('#' * level_count, title.rstrip("= \r\t"))
    return line


def convert_traclink_to_creolelink(m):
    """Rewrite a Trac-native ``[...]`` link match as a Creole ``[[...]]`` link.

    Normalising to the Creole form means later passes only deal with one
    link syntax, and one that is harder to confuse with half-converted
    Markdown.

    Args:
        m: Match object whose group 1 is the text between the brackets.

    Returns:
        ``[[target|label]]`` when the text contains a space, ``[[text]]`` when
        it contains a colon or matches ``camelcase_pattern``, and otherwise the
        original matched text untouched.
    """
    body = m.group(1).strip()

    # "[target label]": everything after the first space is the label.
    if " " in body:
        pieces = body.split(" ", 1)
        return "[[{0[0]}|{0[1]}]]".format(pieces)

    # A bare target is only treated as a link if it looks like one.
    looks_like_link = ":" in body or camelcase_pattern.match(body)
    if looks_like_link:
        return "[[{}]]".format(body)

    return m.group(0)


# Probably most of the non-wiki scheme tests should become a table in an
# extended JSON config file which maps
#
#   { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }

def convert_wikilinks(m, slug, giturl):
    """Turn one Creole-style ``[[...]]`` link match into Markdown.

    Args:
        m: Match object with three groups: optional scheme, link target and
            optional label.
        slug: Page slug, used as the directory in ``{attach}`` paths.
        giturl: Base URL that ``source:`` and ``browser:`` links are joined to.

    Returns:
        ``<url>`` for an unlabelled http link, an ``{attach}`` link for
        attachments, a link under ``giturl`` for source/browser links, a
        ``{filename}...md`` link for wiki pages, and a plain Markdown link
        for anything else.
    """
    scheme, link, text = (part.strip() if part else part for part in m.groups())

    # Without an explicit label the target doubles as the label.
    if text is None:
        text = link

    # Strip one pair of matching quotes from the target and from the label.
    quote_chars = ('"', "'")
    if any(link.startswith(ch) and link.endswith(ch) for ch in quote_chars):
        link = link[1:-1]
    if any(text.startswith(ch) and text.endswith(ch) for ch in quote_chars):
        text = text[1:-1]

    if text == link and link.startswith("http") and "://" in link:
        return "<{}>".format(link)

    if scheme == "attachment:":
        return "[{}]({{attach}}{}/{})".format(text, slug, link)

    if scheme in ("source:", "browser:"):
        base = giturl.rstrip("/")
        path = link.lstrip("/")
        return "[{}]({}/{})".format(text, base, path)

    is_wiki_page = scheme == "wiki:" or (scheme is None and camelcase_pattern.match(link))
    if is_wiki_page:
        return "[{}]({{filename}}{}.md)".format(text, link)

    return "[{}]({})".format(text, link)


def convert_image(m, slug):
    """Turn one ``[[Image(...)]]`` macro match into Markdown or HTML.

    Only the first comma-separated macro argument is used; the remaining
    arguments are ignored.

    Args:
        m: Match object whose group 1 is the macro's argument list.
        slug: Page slug, used as the directory in the ``{attach}`` path.

    Returns:
        An ``<img>`` tag when the first argument contains ``://``, otherwise
        a Markdown image pointing at the fully percent-encoded name under
        ``{attach}slug/``.
    """
    first_arg, _, _ = m.group(1).partition(",")
    target = first_arg.strip()

    if "://" not in target:
        encoded = quote(target, "")
        return "![{}]({{attach}}{}/{})".format(target, slug, encoded)

    return "<img src=\"{}\">".format(target)


def WikiToMD(content, slug, giturl="https://git.cryptech.is/"):
    """Convert the text of a Trac wiki page to Markdown.

    The page is processed one line at a time.  The ``{{{`` and ``}}}`` markers
    become Markdown code fences and lines inside such a block are otherwise
    copied through untouched.  Every other line goes through the link, table,
    list, heading, image and inline-markup conversions below, in that order.

    Args:
        content: Trac wiki markup for the whole page.
        slug: Page slug, used in the ``{attach}`` paths generated for
            attachments and images.
        giturl: Base URL for ``source:`` and ``browser:`` links.  Defaults to
            the URL that was previously hard-coded in this function.

    Returns:
        The converted page as a single string.
    """

    code_block = False
    in_list = False
    in_table = False
    list_indents = []   # Source indent of each currently open list nesting level

    # Force a real newline after every explicit line break ([[br]] or "\\") so that
    # linebreak_pattern can find the marker at the end of a line further down.
    old_content = content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
    new_content = []

    for line in old_content:
        line = line.rstrip()
        tail = ["\n"]

        # Code block markers.  A line holding both markers opens and closes the block
        # at once, so it is still treated as ordinary text below.
        if "{{{" in line:
            code_block = True
            line = line.replace("{{{", "```")
        if "}}}" in line:
            code_block = False
            line = line.replace("}}}", "```")

        if not code_block:

            # Convert CamelCase links to explicit links
            line = camelcase_pattern.sub(r"[[\1]]", line)

            # Convert TracLinks to WikiCreole links to simplify remaining processing
            line = traclink_pattern.sub(convert_traclink_to_creolelink, line)

            # Convert tables.  References:
            #   https://github.github.com/gfm/#tables-extension-
            #   https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
            #
            # Trac does not require a header row, Markdown does, so the first row of every
            # table is treated as the header and the mandatory delimiter row is emitted
            # right after it.  The "=" header/alignment markers are simply dropped.  A table
            # ends at the first line that does not start with "||".
            if line.strip().startswith("||"):
                line = line.replace("=|", "|").replace("|=", "|")
                line = line.replace("||", "|")
                if not in_table:
                    tail.append("|---" * (line.count("|") - 1) + "|\n")
                in_table = True
            elif in_table:
                new_content.append("\n")
                in_table = False

            #
            # Convert bullet lists.  The start and end of a list needs an empty line.
            #
            # Nesting depth is measured relative to the indent of the items already open,
            # not relative to column 0: Trac list items normally carry a leading space, and
            # emitting those top-level items with four spaces of indent would make Markdown
            # render them as an indented code block.  Keeping a stack of indents also lets
            # an item that dedents several levels at once land on the right level.
            #
            nested_line = line.lstrip(' ')
            if nested_line.startswith(('- ', '* ')):
                indent = len(line) - len(nested_line)
                if not in_list:
                    new_content.append("\n")
                    list_indents = []
                    in_list = True
                while list_indents and indent < list_indents[-1]:
                    list_indents.pop()
                if not list_indents or indent > list_indents[-1]:
                    list_indents.append(indent)
                line = '    ' * (len(list_indents) - 1) + nested_line
            elif in_list:
                new_content.append("\n")
                in_list = False
                list_indents = []

            # Convert !x quoting
            line = bangquote_pattern.sub(r"\1", line)

            # Convert (limited subset of) spans
            line = span_pattern.sub(r"\1", line)

            # Convert headers
            line = convert_headers(line)

            # Convert images
            line = image_pattern.sub(lambda m: convert_image(m, slug), line)

            # Delete Trac macros that have no useful counterpart
            line = delete_pattern.sub("", line)

            # Convert wiki links
            line = wikilink_pattern.sub(lambda m: convert_wikilinks(m, slug, giturl), line)

            # Convert striked through text
            line = strikethrough_pattern.sub(r"<s>\1</s>", line)

            # Convert line breaks
            # Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
            line = linebreak_pattern.sub("  ", line)

            # Convert bold and italic text (do this last)
            line = line.replace("'''", "**")  # Convert bold text
            line = line.replace("''", "*")  # Convert italic text

        new_content.append(line)
        new_content.extend(tail)

    return "".join(new_content)
#!/usr/bin/env python3

# Untested code from https://www.snip2code.com/Snippet/1704331/Convert-trac-markup-to-Markdown/

# This code mostly taken from patches to pagure_importer by mreynolds

import sys
import re
import time
import requests
import shutil
import os
from base64 import b64decode
from datetime import datetime
from urllib.parse import quote

# Explicit line breaks: the [[br]] macro (any case) or a double backslash.
content_linebreak_pattern = re.compile(r"\[\[br\]\]|\\\\", re.I)

# Trac-native single-bracket link, "[target]" or "[target label]".  The
# lookbehind and the character class keep it from matching inside "[[...]]".
traclink_pattern = re.compile(r"(?<!\[)\[([^][]+)\]")

# [[Image(args)]] macro.  The argument list is matched non-greedily so that
# two macros on the same line are seen as two images rather than as a single
# match running from the first "[[Image(" to the last ")]]".
image_pattern = re.compile(r"\[\[Image\((.*?)\)\]\]")

# Creole-style link: [[scheme:target|label]], scheme and label both optional.
wikilink_pattern = re.compile(r"\[\[(wiki:|attachment:|source:|browser:)?([^]|[]+)(?:[|]([^][]+))?\]\]")

# ~~struck out~~ text, "!" quoting of a word, and a double backslash at end of line.
strikethrough_pattern = re.compile(r"~~([^~]+)~~")
bangquote_pattern = re.compile(r"!((?:\w|[#])+)")
linebreak_pattern = re.compile(r"\\\\$")

# A CamelCase word standing on its own (delimited by whitespace or line boundaries).
camelcase_pattern = re.compile(r"(?:^|(?<=\s))([A-Z][a-z]+[A-Z][a-z][A-Za-z]*)(?:$|(?=\s))")

# [[span(attr, ..., text)]]: only the last comma-separated argument is captured.
span_pattern = re.compile(r"\[\[span\((?:[^][]*,)*([^(),]+)\)\]\]")

# Macros that are simply removed from the output.
delete_pattern = re.compile(r"\[\[PageOutline\]\]", re.I)

# (level, pattern) pairs for "= Heading =" through "====== Heading ======".
# The greedy (.*) also captures any trailing "=" markers; convert_headers()
# strips them afterwards.
wikiheading_patterns = tuple(
    (level, re.compile("^{} (.*)[ \t]*=*$".format("=" * level)))
    for level in range(1, 7))


def convert_headers(line):
    """Convert a Trac ``= Heading =`` line into a Markdown ``# Heading`` line.

    Each entry of ``wikiheading_patterns`` is tried in order.  The first one
    that matches with a non-empty title wins: the title has trailing ``=``,
    spaces, tabs and carriage returns stripped and is prefixed with one ``#``
    per heading level.

    Args:
        line: A single line of wiki text.

    Returns:
        The Markdown heading, or ``line`` unchanged if it is not a heading.
    """
    for level_count, header in wikiheading_patterns:
        match = header.search(line)
        if match is None:
            continue            # Not a heading of this level, try the next one
        title = match.group(1)
        if title:
            # No need to check other heading levels
            return "{} {}".format('#' * level_count, title.rstrip("= \r\t"))
    return line


def convert_traclink_to_creolelink(m):
    """Rewrite a Trac-native ``[...]`` link match as a Creole ``[[...]]`` link.

    Normalising to the Creole form means later passes only deal with one
    link syntax, and one that is harder to confuse with half-converted
    Markdown.

    Args:
        m: Match object whose group 1 is the text between the brackets.

    Returns:
        ``[[target|label]]`` when the text contains a space, ``[[text]]`` when
        it contains a colon or matches ``camelcase_pattern``, and otherwise the
        original matched text untouched.
    """
    body = m.group(1).strip()

    # "[target label]": everything after the first space is the label.
    if " " in body:
        pieces = body.split(" ", 1)
        return "[[{0[0]}|{0[1]}]]".format(pieces)

    # A bare target is only treated as a link if it looks like one.
    looks_like_link = ":" in body or camelcase_pattern.match(body)
    if looks_like_link:
        return "[[{}]]".format(body)

    return m.group(0)


# Probably most of the non-wiki scheme tests should become a table in an
# extended JSON config file which maps
#
#   { "source:fee/fie/foe/fum": "https://git.cryptech.is/blarg/blee/blue" }

def convert_wikilinks(m, slug, giturl):
    """Turn one Creole-style ``[[...]]`` link match into Markdown.

    Args:
        m: Match object with three groups: optional scheme, link target and
            optional label.
        slug: Page slug, used as the directory in ``{attach}`` paths.
        giturl: Base URL that ``source:`` and ``browser:`` links are joined to.

    Returns:
        ``<url>`` for an unlabelled http link, an ``{attach}`` link for
        attachments, a link under ``giturl`` for source/browser links, a
        ``{filename}...md`` link for wiki pages, and a plain Markdown link
        for anything else.
    """
    scheme, link, text = (part.strip() if part else part for part in m.groups())

    # Without an explicit label the target doubles as the label.
    if text is None:
        text = link

    # Strip one pair of matching quotes from the target and from the label.
    quote_chars = ('"', "'")
    if any(link.startswith(ch) and link.endswith(ch) for ch in quote_chars):
        link = link[1:-1]
    if any(text.startswith(ch) and text.endswith(ch) for ch in quote_chars):
        text = text[1:-1]

    if text == link and link.startswith("http") and "://" in link:
        return "<{}>".format(link)

    if scheme == "attachment:":
        return "[{}]({{attach}}{}/{})".format(text, slug, link)

    if scheme in ("source:", "browser:"):
        base = giturl.rstrip("/")
        path = link.lstrip("/")
        return "[{}]({}/{})".format(text, base, path)

    is_wiki_page = scheme == "wiki:" or (scheme is None and camelcase_pattern.match(link))
    if is_wiki_page:
        return "[{}]({{filename}}{}.md)".format(text, link)

    return "[{}]({})".format(text, link)


def convert_image(m, slug):
    """Turn one ``[[Image(...)]]`` macro match into Markdown or HTML.

    Only the first comma-separated macro argument is used; the remaining
    arguments are ignored.

    Args:
        m: Match object whose group 1 is the macro's argument list.
        slug: Page slug, used as the directory in the ``{attach}`` path.

    Returns:
        An ``<img>`` tag when the first argument contains ``://``, otherwise
        a Markdown image pointing at the fully percent-encoded name under
        ``{attach}slug/``.
    """
    first_arg, _, _ = m.group(1).partition(",")
    target = first_arg.strip()

    if "://" not in target:
        encoded = quote(target, "")
        return "![{}]({{attach}}{}/{})".format(target, slug, encoded)

    return "<img src=\"{}\">".format(target)


def WikiToMD(content, slug, giturl="https://git.cryptech.is/"):
    """Convert the text of a Trac wiki page to Markdown.

    The page is processed one line at a time.  The ``{{{`` and ``}}}`` markers
    become Markdown code fences and lines inside such a block are otherwise
    copied through untouched.  Every other line goes through the link, table,
    list, heading, image and inline-markup conversions below, in that order.

    Args:
        content: Trac wiki markup for the whole page.
        slug: Page slug, used in the ``{attach}`` paths generated for
            attachments and images.
        giturl: Base URL for ``source:`` and ``browser:`` links.  Defaults to
            the URL that was previously hard-coded in this function.

    Returns:
        The converted page as a single string.
    """

    code_block = False
    in_list = False
    in_table = False
    list_indents = []   # Source indent of each currently open list nesting level

    # Force a real newline after every explicit line break ([[br]] or "\\") so that
    # linebreak_pattern can find the marker at the end of a line further down.
    old_content = content_linebreak_pattern.sub("\\\\\\\\\n", content).splitlines()
    new_content = []

    for line in old_content:
        line = line.rstrip()
        tail = ["\n"]

        # Code block markers.  A line holding both markers opens and closes the block
        # at once, so it is still treated as ordinary text below.
        if "{{{" in line:
            code_block = True
            line = line.replace("{{{", "```")
        if "}}}" in line:
            code_block = False
            line = line.replace("}}}", "```")

        if not code_block:

            # Convert CamelCase links to explicit links
            line = camelcase_pattern.sub(r"[[\1]]", line)

            # Convert TracLinks to WikiCreole links to simplify remaining processing
            line = traclink_pattern.sub(convert_traclink_to_creolelink, line)

            # Convert tables.  References:
            #   https://github.github.com/gfm/#tables-extension-
            #   https://permatrac.noc.ietf.org/wiki/WikiFormatting#Tables
            #
            # Trac does not require a header row, Markdown does, so the first row of every
            # table is treated as the header and the mandatory delimiter row is emitted
            # right after it.  The "=" header/alignment markers are simply dropped.  A table
            # ends at the first line that does not start with "||".
            if line.strip().startswith("||"):
                line = line.replace("=|", "|").replace("|=", "|")
                line = line.replace("||", "|")
                if not in_table:
                    tail.append("|---" * (line.count("|") - 1) + "|\n")
                in_table = True
            elif in_table:
                new_content.append("\n")
                in_table = False

            #
            # Convert bullet lists.  The start and end of a list needs an empty line.
            #
            # Nesting depth is measured relative to the indent of the items already open,
            # not relative to column 0: Trac list items normally carry a leading space, and
            # emitting those top-level items with four spaces of indent would make Markdown
            # render them as an indented code block.  Keeping a stack of indents also lets
            # an item that dedents several levels at once land on the right level.
            #
            nested_line = line.lstrip(' ')
            if nested_line.startswith(('- ', '* ')):
                indent = len(line) - len(nested_line)
                if not in_list:
                    new_content.append("\n")
                    list_indents = []
                    in_list = True
                while list_indents and indent < list_indents[-1]:
                    list_indents.pop()
                if not list_indents or indent > list_indents[-1]:
                    list_indents.append(indent)
                line = '    ' * (len(list_indents) - 1) + nested_line
            elif in_list:
                new_content.append("\n")
                in_list = False
                list_indents = []

            # Convert !x quoting
            line = bangquote_pattern.sub(r"\1", line)

            # Convert (limited subset of) spans
            line = span_pattern.sub(r"\1", line)

            # Convert headers
            line = convert_headers(line)

            # Convert images
            line = image_pattern.sub(lambda m: convert_image(m, slug), line)

            # Delete Trac macros that have no useful counterpart
            line = delete_pattern.sub("", line)

            # Convert wiki links
            line = wikilink_pattern.sub(lambda m: convert_wikilinks(m, slug, giturl), line)

            # Convert striked through text
            line = strikethrough_pattern.sub(r"<s>\1</s>", line)

            # Convert line breaks
            # Markdown spec says linebreak is <SPACE><SPACE><RETURN>, who am I to argue?
            line = linebreak_pattern.sub("  ", line)

            # Convert bold and italic text (do this last)
            line = line.replace("'''", "**")  # Convert bold text
            line = line.replace("''", "*")  # Convert italic text

        new_content.append(line)
        new_content.extend(tail)

    return "".join(new_content)