From 95b84a79c4e6abde1ca6b1689b7c449ff7cf5aa1 Mon Sep 17 00:00:00 2001 From: Yorik van Havre Date: Fri, 3 Sep 2021 12:21:46 +0200 Subject: [PATCH] Initial commit --- .gitignore | 1 + README.md | 14 + migrate.py | 737 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 752 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100755 migrate.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..a6c57f5fb2 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.json diff --git a/README.md b/README.md new file mode 100644 index 0000000000..32958ea3f4 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# FreeCAD documentation + +This repository contains an automatic conversion of the FreeCAD wiki located at https://wiki.freecadweb.org to [markdown format](https://en.wikipedia.org/wiki/Markdown). The conversion is done automatically through the [migrate.py](./migrate.py) script located within this repository. The script downloads the wiki contents in XML format using the MediaWiki API and uses [pandoc](https://pandoc.org/) to convert between mediawiki and markdown formats. + +The general idea is to keep allowing users to use the wiki to edit and add contents, while the markdown format for: + +1. Better handling of the documentation within FreeCAD, have the ability to code an own help viewer better than the QAssistant viewer currently in use +2. Allow to use either an online or an offline version of the documentation +3. Automatic and easier backups, same as the FreeCAD source code +4. Better versioning and matching to FreeCAD versions +5. Better handling of translations + +Read on to the [Documentation home page](wiki/main_page.md) + diff --git a/migrate.py b/migrate.py new file mode 100755 index 0000000000..854881cd0b --- /dev/null +++ b/migrate.py @@ -0,0 +1,737 @@ +#!/usr/bin/env python3 + +#*************************************************************************** +#* Copyright (c) 2021 Yorik van Havre * +#* * +#* This program is free software; you can redistribute it and/or modify * +#* it under the terms of the GNU Lesser General Public License (LGPL) * +#* as published by the Free Software Foundation; either version 2 of * +#* the License, or (at your option) any later version. * +#* for detail see the LICENCE text file. * +#* * +#* This program is distributed in the hope that it will be useful, * +#* but WITHOUT ANY WARRANTY; without even the implied warranty of * +#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +#* GNU Library General Public License for more details. * +#* * +#* You should have received a copy of the GNU Library General Public * +#* License along with this program; if not, write to the Free Software * +#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * +#* USA * +#* * +#*************************************************************************** + +""" +migrate.py - MediaWiki to Markdown migration tool + +This tool allows to download all pages and images from a MediaWiki instance, +and save all images and pages as markdown files. It keeps track of page versions, +so it can update an existing download by just updating what has changed. + +Basic command-line usage: +------------------------- + + migrate.py --init + + See bottom of file for available functions + +Basic Python usage: +------------------- + + from migrate import MediaWiki + wiki = MediaWiki([url]) + wiki.init() # do this the first time. You can interrupt and resume init() later + wiki.update() # do this next times + +Detailed Python usage: +---------------------- + +First time: + + from migrate import MediaWiki + wiki = MediaWiki(url="https://wiki.freecadweb.org") + wiki.getPageNames() # not strictly needed, done automatically by next step + wiki.getAllPages() # this writes to disk cache, so it can be interrupted/resumed + wiki.writeAllPages() # write md files + wiki.getAllImages() # fetches and saves all images + +You will get two json files a text cache and a revision set. + +Next times: + + from migrate import MediaWiki + wiki = MediaWiki(url="https://wiki.freecadweb.org") # this loads filecache.json + oldrevisions = wiki.readRevisions() # this loads the latest revision set + wiki.getPageNames() # this fetches new page names, if any + newrevisions = wiki.getRevisions() + wiki.updateRevisions(oldrevisions,newrevisions) # this writes to disk cache like getAllPages + wiki.writeAllPages() # write md files + wiki.getAllImages() # fetches and saves all images - existing ones are not overwritten +""" + +import requests +import sys +import os +import re +import json +import datetime +import pypandoc + +class MediaWiki: + + """MediaWiki([url]) - A class to represent an online MediaWiki instance. + default url is wiki.freecadweb.org""" + + def __init__(self,url="https://wiki.freecadweb.org"): + + if not url.endswith("api.php"): + if not url.endswith("/"): + url += "/" + url += "api.php" + self.url = url + self.cachefile = os.path.join(os.path.dirname(__file__),"cache.json") + self.session = requests.Session() + self.pagenames = [] + self.pagecount = 0 + self.pagecontents = {} + self.readCache() + self.imagecount = 0 + self.images = {} + self.output = os.path.join(os.path.dirname(__file__),"wiki") + if not os.path.exists(self.output): + os.mkdir(self.output) + self.imagefolder = "images" + self.translationfolder = "translations" + + ### MAIN TOOLS + + + def init(self): + + """init(): + Performs an initial import of the pages and images. + If a cache already exists, it will be fully overwritten.""" + + self.getPageNames() + self.getAllPages() + + + def update(self): + + """update(): + Performs initial import if running for the first time, + or updates the contents if a cache already exists""" + + if not self.pagecontents: + self.init() + oldrevisions = self.readRevisions() + self.getPageNames() + newrevisions = self.getRevisions() + self.updateRevisions(oldrevisions,newrevisions) + + + ### UTILS + + + def printProgress(self,count=None,total=100,text=""): + + """printProgress([count,total,text]): + Prints a progress bar indicating the current progresses (count/total) + and optionally an info text. Calling printProgress() terminates with a + newline""" + + sys.stdout.flush() + if count is None: + sys.stdout.write("\r"+" ".ljust(72)+"\r") + return + sys.stdout.write(('\r'+text+' '+str(int((count/total)*100))+'%').ljust(72)) + + + ### CACHE OPERATIONS + + + def writeCache(self): + + """writeCache(): + Writes the contents of self.pagecontents to disk""" + + if self.pagecontents: + with open(self.cachefile,'w',encoding='utf8') as jfile: + json.dump(self.pagecontents, jfile, ensure_ascii = False) + + + def readCache(self): + + """readCache(): + Reads the contents of self.pagecontents from disk""" + + if os.path.exists(self.cachefile): + with open(self.cachefile) as jfile: + self.pagecontents = json.load(jfile) + self.pagenames = self.pagecontents.keys() + self.pagecount = len(self.pagecontents) + + + ### PAGE OPERATIONS + + + def getPageCount(self): + + """getPageCount(): + Returns the number of content pages. + Also stores the total in self.pagecount""" + + params = { "action": "query", + "meta": "siteinfo", + "formatversion": "2", + "format": "json", + "siprop": "statistics", + } + result = self.session.get(url=self.url, params=params) + data = result.json() + pagecount = data["query"]["statistics"]["articles"] + self.pagecount = pagecount + return pagecount + + + def getPageNames(self): + + """getPageNames(): + Returns a list of all pages of the wiki. + Also stores the list in self.pagenames""" + + pages = [] + count = 1 + apfrom = None + params = { "action": "query", + "format": "json", + "list": "allpages", + "aplimit": "500", + } + if not self.pagecount: + self.getPageCount() + while True: + self.printProgress(len(pages),self.pagecount,"Getting pages list...") + if apfrom: + params["apfrom"] = apfrom + else: + if "apfrom" in params: + break + result = self.session.get(url=self.url, params=params) + data = result.json() + pages.extend([page["title"].replace(" ","_") for page in data["query"]["allpages"]]) + apfrom = None + if "continue" in data: + if "apcontinue" in data["continue"]: + if data["continue"]["apcontinue"]: + apfrom = data["continue"]["apcontinue"] + count += 1 + self.printProgress() + self.pagenames = pages + self.pagecount = len(self.pagenames) + return pages + + + def getPage(self,name): + + """getPage(name): + Returns the wiki content of a page and a revision ID. + Also stores it in self.pagecontents[name]""" + + params = { "action": "parse", + "format": "json", + "page": name, + "prop": "wikitext|revid", + "formatversion": "2", + } + result = self.session.get(url=self.url, params=params) + data = result.json() + wikitext = data["parse"]["wikitext"] + revision = data["parse"]["revid"] + self.pagecontents[name] = wikitext + return wikitext,revision + + + def getAllPages(self,pageset=None): + + """getAllPages([pageset]) + Gets the contents of all of the wiki pages + and stores everything in self.pagecontents. If pageset is + not given, all pages from self.pagenames are retrieved""" + + revfile = os.path.join(os.path.dirname(__file__),"revisions_tmp.json") + revisions = self.readRevisions(revfile) + overwrite = True + if not pageset: + overwrite = False + if not self.pagenames: + self.getPageNames() + pageset = self.pagenames + count = 1 + for page in pageset: + self.printProgress(count,len(pageset),"Getting page "+page+"...") + if overwrite or (not page in self.pagecontents): + text,revid = self.getPage(page) + revisions[page] = revid + if count % 10 == 0: + self.writeCache() + self.writeRevisions(revisions,revfile) + count += 1 + self.writeCache() + self.printProgress() + if os.path.exists(revfile): + os.remove(revfile) + self.writeRevisions(revisions) + return revisions + + + ### IMAGE OPERATIONS + + + def getImageCount(self): + + """getImageCount(): + Returns the number of images. + Also stores the total in self.imagecount""" + + params = { "action": "query", + "meta": "siteinfo", + "formatversion": "2", + "format": "json", + "siprop": "statistics", + } + result = self.session.get(url=self.url, params=params) + data = result.json() + imagecount = data["query"]["statistics"]["images"] + self.imagecount = imagecount + return imagecount + + + def getImageNames(self): + + """getImageNames(): + Returns a list of all images of the wiki. + Also stores the paths in self.images""" + + images = {} + count = 1 + aifrom = None + params = { "action": "query", + "format": "json", + "list": "allimages", + "ailimit": "100", + } + if not self.imagecount: + self.getImageCount() + while True: + self.printProgress(count,self.imagecount,"Getting images list...") + if aifrom: + params["aifrom"] = aifrom + else: + if "aifrom" in params: + break + result = self.session.get(url=self.url, params=params) + data = result.json() + for image in data["query"]["allimages"]: + images[image["name"]] = image["url"] + count += 1 + aifrom = None + if "continue" in data: + if "aicontinue" in data["continue"]: + if data["continue"]["aicontinue"]: + aifrom = data["continue"]["aicontinue"] + self.printProgress() + self.images = images + self.imagecount = len(self.images.keys()) + return images + + + def getImage(self,name,override=False,basepath=None): + + """getImage(name,[override,basepath]): + Downloads and saves the given image in an images subfolder. + If no basepath is given, the current dir is used. + If override is True, file is downloaded again""" + + if not basepath: + basepath = self.output + basedir = os.path.join(basepath,self.imagefolder) + if not os.path.isdir(basedir): + os.mkdir(basedir) + filename = os.path.join(basedir,name) + if override or (not os.path.exists(filename)): + filecontents = requests.get(self.images[name]) + with open(filename,"wb") as imagefile: + imagefile.write(filecontents.content) + + + def getAllImages(self,override=False,basepath=None): + + """getAllImages([override]): + Saves all images to disk. + If no basepath is given, the current dir is used. + If override is True, file is downloaded again""" + + if not basepath: + basepath = self.output + if not self.images: + self.getImageNames() + count = 1 + for name in self.images.keys(): + self.printProgress(count,self.imagecount,"Downloading "+name+"...") + self.getImage(name,override,basepath) + count += 1 + + + ### REVISION OPERATIONS + + + def getRevisions(self): + + """getRevisions(): + Returns a list of revisions for the pages stored in self.pagenames""" + + revisions = {} + count = 1 + apfrom = None + params = { "action": "query", + "format": "json", + "prop": "revisions", + "rvprop": "ids", + "rvslots": "main", + "formatversion": "2", + } + if not self.pagenames: + self.getPageNames() + self.printProgress(0,100,"Getting pages revisions...") + titles = "" + for page in self.pagenames: + if titles: + titles += "|" + titles += page + count += 1 + if count % 10 != 0: + params["titles"] = titles + result = self.session.get(url=self.url, params=params) + data = result.json() + for page in data["query"]["pages"]: + if "revisions" in page: + revisions[page["title"]] = page["revisions"][0]["revid"] + self.printProgress(count,self.pagecount,"Getting pages revisions...") + titles = "" + self.printProgress() + return revisions + + + def writeRevisions(self,revisions,filename=None): + + """writeRevisions(revisions,[filename]): + Writes the contents of revisions to disk. If no filename is + given, one is created automatically from the current timestamp""" + + if not filename: + d = str(datetime.datetime.now()) + fp = os.path.dirname(__file__) + filename = os.path.join(fp,"revisions_"+d+".json") + with open(filename,'w',encoding='utf8') as jfile: + json.dump(revisions, jfile, ensure_ascii = False) + + + def readRevisions(self,filename=None): + + """readRevisions([filename]): + Returns a revisions dictionary previously stored in a file. If + no filename is given, the latest one is taken""" + + data = {} + if not filename: + files = os.listdir(os.path.dirname(__file__)) + jsonfiles = [f for f in files if (f.endswith(".json") and f.startswith("revision"))] + jsonfiles.sort(key=os.path.getmtime, reverse=True) + filename = os.path.join(os.path.dirname(__file__),jsonfiles[0]) + print("Opening",filename) + if os.path.exists(filename): + with open(filename) as jfile: + data = json.load(jfile) + return data + + + def updateRevisions(self,oldrevision,newrevision): + + """updateRevisions(oldrevision,newrevision): + Reads again the pages that need an update""" + + pageset = [] + for page,revid in newrevision.items(): + if page in oldrevision: + if oldrevision[page] == revid: + continue + pageset.append(page) + self.getAllPages(pageset) + + + ### MARKDOWN OPERATIONS + + + def getMarkdown(self,wikitext,clean=True): + + """getMarkdown(wikitext,[clean]): + Returns a markdown version of a text in wiki format. + If clean is false, raw pandoc output is returned""" + + xargs = ['--atx-headers'] # pandoc arguments + try: + fmt = 'markdown+hard_line_breaks' + result = pypandoc.convert_text(wikitext, fmt, format='mediawiki', extra_args=xargs) + except: + return None + else: + if clean: + return self.cleanMarkdown(result) + else: + return result + + + def cleanMarkdown(self,mdtext,imagepath=None): + + """cleanMarkdown(mdtext,[imagepath]): + Returns a cleaned version of the given markdown text. + Imagepath indicates the location of images relative to this page + (default = "images")""" + + result = mdtext + flags = re.DOTALL|re.MULTILINE + if not imagepath: + imagepath = self.imagefolder + + # templates that are safe to remove entirely + unusedtemplates = ["Userdocnavi","Arch Tools navi","\\#translation:","clear", + "Part Tools navi","Draft Tools navi", + ] + + # path replacements + result = re.sub("\!\[(.*?)\]\(",r"![\1]("+imagepath+"/",result) # add /image to image paths + result = re.sub(" \"wikilink\"",".md",result) # add .md to wiki page links + + # template fixes + result = re.sub("\`.*?\`\{\=html\}","",result) # remove html tags + result = re.sub("","",result,flags=flags) # remove html comments + result = re.sub("{{Docnav.*?}}","",result,flags=flags) # remove {{Docnav}} templates + result = re.sub("{{Caption\|(.*?)}}",r"*\1*",result,flags=flags) # replace {{Caption}} templates + result = re.sub("{{KEY\|(.*?)}}",r"**\1**",result,flags=flags) # replace {{KEY}} templates + result = re.sub("{{Button\|(.*?)}}",r"**\1**",result,flags=flags) # replace {{Button}} templates + result = re.sub("{{PropertyData\|(.*?)}}",r"**\1**",result,flags=flags) # replace property templates + result = re.sub("{{PropertyView\|(.*?)}}",r"**\1**",result,flags=flags) # replace property templates + result = re.sub("{{Version\|(.*?)}}",r"(v\1) ",result,flags=flags) # replace {{Version}} templates + result = re.sub("{{version\|(.*?)}}",r"(v\1) ",result,flags=flags) # replace {{Version}} templates + result = re.sub("{{VersionPlus\|(.*?)}}",r"(v\1) ",result,flags=flags) # replace {{Version}} templates + result = re.sub("{{Emphasis\|(.*?)}}",r"**\1**",result,flags=flags) # replace {{Emphasis}} templates + result = re.sub("{{\#translation\:}}","",result,flags=flags) # replace {{translation}} templates + + # turning GuiCommand block into YAML + if "{{GuiCommand" in result: + guicommandblk = re.findall("```{\=mediawiki}.*?{{GuiCommand(.*?)}}\n```",result,flags=flags) + if guicommandblk: + guicommandblk = guicommandblk[0] + guicommandblk = re.sub("\|(.*?)\=(.*?)",r" \1:\2",guicommandblk) # fixing GuiCommand contents + result = re.sub("```{\=mediawiki}.*?{{GuiCommand(.*?)}}\n```",r"---\n- GuiCommand:"+guicommandblk+"---\n",result,flags=flags) + result = "---"+"---".join(result.split("---")[1:]) # removing empty line before yaml block + + # remove code fences + result = re.sub("\`","",result) + result = re.sub("\{\=mediawiki\}","",result) + result = re.sub("\{\:mediawiki\}","",result) + + # creating new code fences + result = re.sub("{{Code\|code\=(.*?)}}",r"```python\1```",result,flags=flags) # replace {{Code}} templates + result = re.sub("{{incode\|(.*?)}}",r"`\1`",result,flags=flags) # replace {{incode}} templates + result = re.sub(" \`\`\`",r" \n```",result,flags=flags) # make sure all ``` are on a new line + result = re.sub("{{TRUE}}",r"`True`",result,flags=flags) # replace {{TRUE}} templates + result = re.sub("{{FALSE}}",r"`False`",result,flags=flags) # replace {{TRUE}} templates + + # fixing links + result = re.sub("(!\[.*?\]\(.*?)\".*?\"(\))",r"\1\2",result) # remove image path descriptions + for l1 in re.findall("\[\[Image\:.*?\|.*?px\]\]",result): + iml1 = re.findall("Image\:(.*?)\|",l1)[0] + iml2 = re.findall("\|(.*?)px\]\]",l1)[0] + l2 = "" + result = result.replace(l1,l2) + result = re.sub("\[\[Image\:(.*?)\|(.*?)\]\]",r"![]("+imagepath+"/\1)",result) + result = re.sub("\[\[(.*?)\|(.*?)\]\]",r"[\2](\1.md)",result) + result = re.sub(r"\]\(.*?\.",lambda x:x.group().replace(" ","_"),result) # replace spaces by underscores in all remaining links + result = re.sub("!\[(.*?)\]\((.*?)\){width=\"(.*?)\"}",r'\1',result) # fix img sizes + #for l in re.findall("\[.*?\]\(.*?\)",result): + # print(" ",l) + + # fixing misc formatting glitches + result = re.sub("\n- \n","\n-",result,flags=flags) # condensing newlines in bullet point lists + result = re.sub("\n \n ","",result,flags=flags) # condensate empty lines + result = re.sub("\n\n\n\(v",r" (v",result,flags=flags) # condensate badly formatted version templates + result = re.sub("\n:\n","",result,flags=flags) # condensate remaining : lines + result = re.sub("\n\n\n\n-","\n-",result,flags=flags) # condensate bad - lists + result = re.sub("(.*?)(\*.*?\*\n)",r"\1\n\2",result) # put captions on a newline + result = re.sub("\[(.*?)\]\(image:(.*?)\.md\)",r"![\1]("+imagepath+r"/\2)",result) # fix image: links + result = re.sub("\[(.*?)px\]\(File:(.*?)\.md\)",r'',result) # fix File: links + + + # removing all remaining templates + for template in re.findall("{{.*?}}",result,flags=flags): + if template.strip("{").strip("}").strip() in unusedtemplates: + result = re.sub(template,"",result,flags=flags) # remove all remaining templates + else: + print("WARNING: Unhandled template:",template) + + return result + + + def writeMarkdown(self,page,override=True,basepath=None,clean=True): + + """writeMarkdown(page,[override,basepath,clean]): + Writes the given page as a .md file. If basepath is + not given, file is written in the current dir. + If override is False, existing files are skipped. + If clean is false, raw pandoc output is returned""" + + if not basepath: + basepath = self.output + if not os.path.exists(basepath): + print("base path",basepath,"does not exist") + return page + result = self.getMarkdown(self.pagecontents[page],clean) + if not result: + print("Error writing page:",page) + return page + if "REDIRECT" in result.split("\n")[0]: + truepage = re.findall("\((.*?)\.md\)",result) + if not truepage: + truepage = re.findall("\((.*?)[\"\)]",result) + if not truepage[0].strip() in self.pagecontents: + print("Error:redirecting to",truepage[0].strip()) + return page + result = self.getMarkdown(self.pagecontents[truepage[0].strip()],clean) + if not result: + print("Error writing page:",page) + return page + else: + filename = page + if "/" in filename: + filename = os.path.join(self.output,self.translationfolder,page.split("/")[-1],page.split("/")[0]) + transpath = os.path.dirname(filename) + if not os.path.exists(transpath): + if not os.path.exists(os.path.join(self.output,self.translationfolder)): + os.mkdir(os.path.join(self.output,self.translationfolder)) + os.mkdir(transpath) + filename += ".md" + filename = os.path.join(basepath,filename) + if override or (not os.file.exists(filename)): + with open(filename,"w") as mdfile: + mdfile.write(result) + return None + + + def writeAllPages(self,override=True,basepath=None): + + """writeAllPages([override]): + Writes all pages to markdown files. Returns + a list of pages which couldn't be written for some reason. + If basepath is not given, file is written in the current dir. + If override is False, existing files are skipped""" + + errors = [] + count = 1 + if not basepath: + basepath = self.output + for page in self.pagenames: + self.printProgress(count,len(self.pagenames),"Saving page "+page+"...") + r = self.writeMarkdown(page,override,basepath) + if r: + errors.append(r) + count += 1 + self.printProgress() + return errors + + + +### GENERAL FUNCTIONS + + + +def init(): + + """performs an initial import""" + + wiki = MediaWiki() + wiki.init() + print("All done!") + + +def update(): + + """performs a full update of the local contents from the wiki""" + + wiki = MediaWiki() + wiki.update() + print("All done!") + + +def test(): + + """creates a couple of pages for testing""" + + w = MediaWiki() + w.writeMarkdown("Arch_Workbench") + w.writeMarkdown("Arch_Wall") + w.writeMarkdown("Draft_Line") + w.writeMarkdown("Part_Extrude") + w.writeMarkdown("Arch_Workbench",basepath=os.path.dirname(__file__)+"/orig",clean=False) + w.writeMarkdown("Arch_Wall",basepath=os.path.dirname(__file__)+"/orig",clean=False) + w.writeMarkdown("Draft_Line",basepath=os.path.dirname(__file__)+"/orig",clean=False) + w.writeMarkdown("Part_Extrude",basepath=os.path.dirname(__file__)+"/orig",clean=False) + + +def writepages(): + + """writes all pages to .md files""" + + wiki = MediaWiki() + #wiki.update() + wiki.writeAllPages() + + +def writeimages(): + + """downloads and wrie images. Existing images are skipped""" + + wiki = MediaWiki() + #wiki.update() + wiki.getAllImages() + + + + + + +### COMMAND-LINE USAGE + + + +if __name__ == "__main__": + + args = sys.argv[1:] + + # execute function + if len(args) == 1: + arg = args[0] + if arg.startswith("--"): + arg = arg[2:] + if arg in globals(): + if callable(globals()[arg]): + globals()[arg]() + exit(0) + + # print help text + funcs = " Available functions:\n" + for name in list(globals().keys()): + if (name != "MediaWiki") and callable(globals()[name]): + funcs += " --" + name + " : " + globals()[name].__doc__ + "\n" + print(__doc__.replace(" See bottom of file for available functions",funcs))