Skip to content

Moin To Markdown

Moin2Markdown

Conversion object to convert MoinMoin wiki markup to Markdown

Attributes:

Name Type Description
fetch_cache FetchCache

A FetchCache object used to retrieve URLs

url_prefix furl

The URL prefix of the Moin wiki web presence

link_table

A mapping of Moin unescaped names to page names

ctx

Context object - logger and user mapping etc

create_translator(ctx, cache_directory, url_prefix, revisions) classmethod

Build a translator object

Parameters:

Name Type Description Default
ctx

Context object (logger etc)

required
cache_directory Path

Path object for the cache directory

required
url_prefix str

The base URL for the MoinMoin wiki

required
revisions MoinEditEntries

The wiki revision entries used to resolve page and attachment links

required
Source code in moin2gitwiki/moin2markdown.py
@classmethod
def create_translator(
    cls,
    ctx,
    cache_directory: Path,
    url_prefix: str,
    revisions: MoinEditEntries,
):
    """
    Build a translator object.

    Parameters:
        ctx:                Context object (logger etc)
        cache_directory:    Path object for the cache directory
        url_prefix:         The base URL for the MoinMoin wiki
        revisions:          The wiki revision entries used to resolve links

    Returns:
        A new translator instance wired up with a fetch cache, the
        revision set, and the wiki URL prefix (as a furl object).

    """
    #
    # Build a fetch cache backed by cache_directory so repeated page
    # fetches hit disk rather than the wiki
    fetch_cache = FetchCache.initialise_cache(
        cache_directory=cache_directory,
        ctx=ctx,
    )
    return cls(
        fetch_cache=fetch_cache,
        revisions=revisions,
        url_prefix=furl(url_prefix),
        ctx=ctx,
    )

extract_content_section(self, html)

Extract the content part of the HTML, and simplify

Parameters:

Name Type Description Default
html str

The html data

required

Pulls out the content div and simplifies the HTML. Simplification consists of:-

  • stripping out redundant anchor spans
  • remove the additional line marking paragraphs
  • rewrite a/hrefs
  • strip internal a/hrefs that have no existing target
  • strip class attributes from links
  • remap any emoji img to the emoji sequence
Source code in moin2gitwiki/moin2markdown.py
def extract_content_section(self, html: str) -> str:
    """
    Extract the content part of the HTML, and simplify

    Parameters:
        html:    The html data

    Returns:
        The simplified inner HTML of the ``id="content"`` div, or an
        empty string when no such div exists.

    Pulls out the content div and simplifies the HTML.
    Simplification consists of:-

    - stripping out redundant anchor spans
    - remove the additional line marking paragraphs
    - rewrite a/hrefs
    - strip internal a/hrefs that have no existing target
    - strip class attributes from links
    - remap any emoji img to the emoji sequence

    """
    soup = BeautifulSoup(html, "html.parser")
    content = soup.find(id="content")
    if content is None:
        return ""
    #
    # now strip out excess rubbish - anchor spans
    for tag in content.find_all(class_="anchor"):
        tag.decompose()
    #
    # Remove dead <p class="line???"> with no closer
    for tag in content.find_all(is_a_linemark_para):
        tag.unwrap()
    #
    # now find all the links, and if within the wiki, rewrite
    for tag in content.find_all("a"):
        # use .get() - an <a> with no href (eg a bare named anchor)
        # would make tag["href"] raise KeyError
        target = tag.get("href")
        if target:
            self.ctx.logger.debug(f"Trying to map link {target}")
            url = self.url_prefix.copy().join(target)
            if url.url.startswith(self.url_prefix.url):
                # path relative to the wiki root, query string removed
                new_url = (
                    url.copy().remove(query=True).url[len(self.url_prefix.url) :]
                )
                if len(str(url.query)) == 0:
                    # no query - this is a conventional link
                    new_target = self.revisions.get_new_link_target(new_url)
                    if new_target:
                        tag["href"] = new_target
                        self.ctx.logger.debug(f"Normal map -> {new_target}")
                elif (
                    "action" in url.query.params
                    and "target" in url.query.params
                    and url.query.params["action"] == "AttachFile"
                ):
                    # AttachFile action - map to the attachment's new home
                    attach_target = url.query.params["target"]
                    new_target = self.revisions.get_new_attachment_link_target(
                        new_url,
                        attach_target,
                    )
                    if new_target:
                        tag["href"] = new_target
                        self.ctx.logger.debug(f"Attach map -> {new_target}")
                else:
                    # internal link with an unhandled query (eg edit/diff
                    # actions) - keep the text, drop the link
                    tag.unwrap()
        #
        # strip any class attributes on links - tend to upset the translator
        if tag.has_attr("class"):
            del tag["class"]
    #
    # now find all the images and see if they map to emojis
    # MoinMoin puts the emoji code in the title, so will purely match on that
    for tag in content.find_all("img"):
        # use .get() - an <img> without src would raise KeyError
        target = tag.get("src")
        self.ctx.logger.debug(f"Image target {target}")
        if tag.has_attr("title") and tag["title"] in self.smiley_map:
            # emoji image - replace the whole tag with the emoji text
            tag.replace_with(" " + self.smiley_map[tag["title"]] + " ")
        elif target:
            # now find all the images, and if an attachment within the wiki, rewrite
            url = self.url_prefix.copy().join(target)
            if url.url.startswith(self.url_prefix.url):
                new_url = (
                    url.copy().remove(query=True).url[len(self.url_prefix.url) :]
                )
                self.ctx.logger.debug(f"Image params {url.query.params}")
                if (
                    "action" in url.query.params
                    and "target" in url.query.params
                    and url.query.params["action"] == "AttachFile"
                ):
                    attach_target = url.query.params["target"]
                    new_target = self.revisions.get_new_attachment_link_target(
                        new_url,
                        attach_target,
                    )
                    if new_target:
                        tag["src"] = new_target
                        self.ctx.logger.debug(f"Image mapped to {new_target}")
            else:
                self.ctx.logger.debug(f"Not mapped - {url.query.params}")
        #
        # strip any class attributes on links - tend to upset the translator
        if tag.has_attr("class"):
            del tag["class"]

    #
    # The forms within the data are basically useless - strip the form and input fields
    for tag in content.find_all("form"):
        tag.unwrap()
    for tag in content.find_all("input"):
        tag.decompose()

    #
    # This might not always work but removing all <div>s makes output cleaner
    for tag in content.find_all("div"):
        tag.unwrap()

    return "".join([str(x) for x in content.contents])

retrieve_and_translate(self, revision)

Retrieve a wiki revision, and translate it to markdown

Parameters:

Name Type Description Default
revision MoinEditEntry

The wiki revision object for the revision we want

required

If the revision maps to an empty object - i.e. it deleted the page, or similar - then None is returned.

Source code in moin2gitwiki/moin2markdown.py
def retrieve_and_translate(self, revision: MoinEditEntry) -> Optional[bytes]:
    """
    Retrieve a wiki revision, and translate it to markdown

    Parameters:
        revision:    The wiki revision object for the revision we want

    If the revision maps to an empty object - ie it deleted the page, or
    similar, then a None object is returned.

    """
    # a revision with no wiki content (eg a page deletion) yields None
    if revision.wiki_content() is None:
        return None
    # build the recall URL for this specific page revision
    recall_url = self.url_prefix.copy()
    recall_url /= revision.page_path_unescaped()
    recall_url.args["action"] = "recall"
    recall_url.args["rev"] = revision.page_revision
    # fetch (via the cache), trim to the content div, then run pandoc
    page_html = self.fetch_cache.fetch(recall_url.url)
    simplified = self.extract_content_section(page_html)
    return self.translate(simplified)

translate(self, input)

Translate HTML to Github Flavoured Markdown using pandoc

Source code in moin2gitwiki/moin2markdown.py
def translate(self, input: str) -> bytes:
    """
    Translate HTML to Github Flavoured Markdown using pandoc.

    Parameters:
        input:    The HTML text to convert

    Returns:
        The pandoc gfm output as raw bytes.

    NOTE(review): pandoc's exit status and stderr are ignored, matching
    the original behaviour - a pandoc failure yields empty output rather
    than an exception.
    """
    # subprocess.run replaces the manual Popen/communicate pairing;
    # feed the HTML on stdin and capture stdout
    result = subprocess.run(
        ["pandoc", "-f", "html", "-t", "gfm"],
        input=input.encode("utf-8"),
        stdout=subprocess.PIPE,
    )
    return result.stdout