#!/usr/bin/python
#
# Script to create a confluence page
#
# This script is set up to create a confluence space corresponding
# to your TWiki web (i.e. space) name.  It will also use the
# first page you create as the content for the space "Home"
# -- otherwise confluence creates a "Home" page that isn't
# linked to anything.
#
# Other notes: The first time you run this, it creates
# a persistent copy of the entire twiki web you read in --
# which is then used for subsequent runs.  This simplifies
# running it repeatedly to tweak the transformation regexes,
# but it prevents you from running the script against
# multiple twiki webs.  Just delete the ".pickle" file or
# delete that bit of code in main().
#
# The script targets Python 2.  The import fallbacks below only exist so
# that the module can also be imported by a Python 3 interpreter (for
# example to exercise the text transformations in isolation).

import os
import re
import sys
import urllib

try:  # Python 2
    from sets import Set
except ImportError:  # Python 3: the builtin set offers the methods used here
    Set = set

try:  # Python 2
    import cPickle
except ImportError:  # Python 3
    import pickle as cPickle

try:  # Python 2
    import xmlrpclib
except ImportError:  # Python 3
    import xmlrpc.client as xmlrpclib

try:  # Python 2
    from urllib import FancyURLopener
except ImportError:  # Python 3
    from urllib.request import FancyURLopener

#
# Set these up to find your pages
#
twikiurlbase = "https://some.twikiwiki.com"
twikibase = "/twiki/bin/view"
twikistartspace = "MWdev"
twikistartconcept = "WebHome"
confluenceurlbase = "https://some.confluencewiki.com/confluence"
twikilogin = "username"
twikipass = "mypass"
confluencelogin = "username"
confluencepass = "mypass"

# Tag delimiters.  The wiki content was seen both with literal angle
# brackets and with their HTML-escaped form, so every tag pattern below
# accepts either spelling.
# NOTE(review): the copy of this script that was reviewed had been passed
# through an HTML stripper that removed the tag names from most regex
# literals.  All patterns built from _LT/_GT, the <textarea> pattern, the
# <img> pattern and the three-space list indent are reconstructions --
# confirm them against a pristine copy of the script.
_LT = r"(?:<|&lt;)"
_GT = r"(?:>|&gt;)"


class Page:
    """One TWiki topic, identified by a (web, topic) tuple.

    A page holds the raw topic text, the set of (web, topic) tuples it
    refers to, and the state needed to turn the text into Confluence
    markup.  Pages compare and hash by their (web, topic) tuple.

    The class is deliberately left without an explicit base class so that
    pickles written by earlier runs under Python 2 keep loading.
    """

    def __init__(self, spaceName):
        """Create an empty page for the (web, topic) tuple *spaceName*."""
        self._spaceName = spaceName
        self._content = None
        self._references = Set()
        self._confluenceRefName = self.confluenceName(spaceName[1])
        self._imagelist = []
        self._id = -1
        # Mapper stores and reads the parent as ``_parent``; initialise the
        # attribute that is actually used.  ``parent`` is kept only so that
        # code reading the old attribute name does not break.
        self._parent = None
        self.parent = None
        # State carried between successive listTransform() calls.
        self.blank = False
        self.listContext = ""

    def __repr__(self):
        return repr(self._spaceName)

    def __str__(self):
        return str(self._spaceName)

    def __eq__(self, other):
        return self._spaceName == other._spaceName

    def __ne__(self, other):
        return self._spaceName != other._spaceName

    def __hash__(self):
        return hash(self._spaceName)

    # Compile some patterns for efficiency
    noConceptPattern = re.compile(
        ".*NOTE: This Wiki topic does not exist yet", re.DOTALL)
    # Splits the "?raw=1" response into the markup before the text area
    # (group 1) and the topic text inside it (group 2).
    # NOTE(review): reconstructed, see the note on _LT/_GT above.
    rawWikiContentPattern = re.compile(
        r"(.*)<textarea[^>]*>(.*)</textarea>.*", re.IGNORECASE | re.DOTALL)

    # Parts of wiki content not searched for references
    # Some folks coded verbatim on our site very weirdly...
    ignoredTextPatterns = [
        re.compile(r"(?s)" + _LT + "verbatim" + _GT + ".*?" +
                   _LT + "/verbatim" + _GT),
        re.compile(r"(?s)" + _LT + "nop" + _GT + ".*?" + _LT + "/nop" + _GT),
        re.compile(r"!\w\w* "),
        re.compile(_LT + "nop" + _GT + r"\w\w* "),
    ]
    nameRefsPatterns = [
        re.compile(r"[^.]([A-Z][a-z][a-z]*[A-Z]\w*)"),
        re.compile(r"\[\[(\w\w*)\]\]"),
    ]

    def setContent(self, rawContent):
        """Extract the topic text and the references from a raw response.

        On success ``_content`` holds the topic text and ``_references``
        gains one (web, topic) tuple per reference found.  ``_content`` is
        set to None when *rawContent* does not have the expected shape or
        when TWiki reports that the topic does not exist.
        """
        match = Page.rawWikiContentPattern.match(rawContent)
        if (not match) or len(match.groups()) != 2:
            print("Error: " + str(self._spaceName) +
                  ": Raw form of content has unexpected form\n" +
                  "Possible cause: non existent wiki link")
            self._content = None
            return
        frontmatter = match.group(1)
        self._content = match.group(2)

        # TWiki give standard message when concept is new
        if Page.noConceptPattern.match(self._content):
            self._content = None
            return

        # match the space and wiki concept references from the side bar
        # NOTE(review): there is no "/" between twikibase and the web name,
        # so with twikibase = "/twiki/bin/view" this does not match links of
        # the form /twiki/bin/view/Web/Topic.  Left as found -- confirm the
        # intended link shape before changing it.
        sidebarRefs = re.findall(
            twikibase + r"(\w\w*)/(\w\w*)[^?]", frontmatter)
        for ref in sidebarRefs:
            # Throw away references to some spaces
            if ref[0] not in ["TWiki", "Main"]:
                self._references.add(ref)

        # Don't search stuff inside verbatim or nop blocks for references
        searchable = self._content
        for pattern in Page.ignoredTextPatterns:
            searchable = pattern.sub("", searchable)
        for pattern in Page.nameRefsPatterns:
            for name in pattern.findall(searchable):
                # A bare topic name refers to a topic in this page's own web
                self._references.add((self._spaceName[0], name))

    # Linking:
    # [title#anchor]      Link a page
    # [dev:title#anchor]  In space with 'dev'
    # [http://host.com]   Remote link
    # [phrase@shortcut]   Shortcut
    # Note: [alias|any_of_above_links]  Custom link title
    #
    # Method to transform camelcase name into something more readable
    # Pass "[Name form]" unchanged, as "Name form"
    confluenceNamePattern = re.compile(r"([A-Z]+[a-z0-9]*) *")

    def confluenceName(self, name):
        """Return *name* with a space after each capitalised word."""
        return re.sub(Page.confluenceNamePattern, r"\1 ", name).strip()

    #
    # Methods for regex transformations, see re.sub
    #
    def nameTransform(self, match):
        """Turn a WikiWord or [[explicit link]] match into a [link]."""
        wikiWord = match.group(1)
        if wikiWord:
            # Group 1 includes the space in front of the WikiWord, which
            # confluenceName() strips; put it back so that the link does
            # not fuse with the preceding word.
            return " [" + self.confluenceName(wikiWord) + "]"
        return "[" + self.confluenceName(match.group(2)) + "]"

    def imageTransform(self, match):
        """Record an image reference and return the !image! markup.

        Group 1 of *match* is the topic the image is attached to and
        group 2 the image file name.
        """
        # Append a tuple identifying the image source
        self._imagelist.append(
            (self._spaceName[0], match.group(1), match.group(2)))
        # Return a link to the image
        return "!" + match.group(2) + "!"

    # This should work for most well-formed twiki list nesting.  Relatively
    # undefined if things get weird.
    def listTransform(self, match):
        """Rewrite one line, converting TWiki list items to Confluence ones.

        Lists gets either groups 1,2,3 as None with 4 containing a whole
        line, or 4 as None with 1,2,3 as the indent, bullet, and content.
        The nesting seen so far is kept in ``listContext``; a blank line
        followed by ordinary text resets it.
        """
        if 4 != len(match.groups()):
            print("Unexpected " + str(match.groups()))
            return ""
        wsp, bullet, content, line = match.groups()

        if line is not None:
            if not line.strip():
                # Blank line, reset list context on the next text line
                self.blank = True
            else:
                if self.blank:
                    self.listContext = ""
                self.blank = False
            return line

        if wsp is not None and bullet is not None and content is not None:
            self.blank = False
            if bullet[0] in "0123456789":
                bullet = '#'
            # Three spaces per nesting level; floor division keeps the
            # result usable as a slice index on every Python version.
            indentation = len(wsp) // 3 - 1
            if indentation <= len(self.listContext):
                # Same or shallower nesting, possibly changing bullet
                self.listContext = self.listContext[0:indentation] + bullet
            else:
                # Deeper nesting, possibly changing bullet
                self.listContext = self.listContext + bullet
            return self.listContext + " " + content

        print("Unexpected: " + str(match.groups()))
        return ""

    def getTransformedContent(self):
        """Return the topic text as Confluence markup, or None if empty.

        Image references found along the way are collected in
        ``_imagelist``.
        """
        if self._content is None:
            return None

        # Start every run from a clean state so that calling this method
        # again yields the same text and the same image list.
        self.blank = False
        self.listContext = ""
        self._imagelist = []

        # transformations should be defined outside the method with
        # compiled regexs, for efficiency, but the bound methods
        # listTransform, imageTransform, nameTransform use the state of
        # the page, so not immediately evident how to do that.
        #
        # The transformation of twiki content into confluence content:
        #   ("regex-to-match", replacement-regex-or-function)
        # These substitutions are processed sequentially, so need to take
        # some care that don't substitute what has already been processed.
        # Hence use of | for some things
        transformations = [
            # A concept reference, camel case or explicit
            (r"( [A-Z][a-z][a-z]*[A-Z]\w*)|\[\[([ \w][ \w]*)\]\]",
             self.nameTransform),
            # Some syntax transforms
            (r"( )__([^ ])", r"\1_*\2"),
            (r"([^ ])__( )", r"\1*_\2"),
            (r"(\s)=([^ =])", r"\1{{\2"),
            (r"([^ =])=([^A-Za-z0-9_\"]|$)", r"\1}}\2"),
            (r"([^ ])==( )", r"\1*}}\2"),
            (r"( )==([^ ])", r"\1{{*\2"),
            # Some tuning for our heading heavy pages --
            # probably want a better style sheet
            (r"(?m)^---\+\+\+\+", r"h5. "),
            (r"(?m)^---\+\+\+", r"h4. "),
            (r"(?m)^---\+\+", r"h2. "),
            (r"(?m)^---\+", r"h1. "),
            (r"(?m)^----*", r"----"),
            # All kinds of lists, hopefully to allow nesting
            # NOTE(review): the indent unit was collapsed to one space in
            # the reviewed copy; three spaces matches the len(wsp) // 3
            # computation in listTransform.
            (r"(?m)^((?:   )+)(-|[*]|[0-9]+) (.*)$|^(.*)$",
             self.listTransform),
            # An image reference, with a page and image name.
            # Need to get and attach the image
            # NOTE(review): the original pattern was lost entirely; this
            # one takes the last two path components of the src attribute
            # as topic and file name.
            (r'(?i)<img\b[^>]*?\bsrc="[^"]*?/(\w+)/([^"/]+)"[^>]*>',
             self.imageTransform),
            # Some folks at our site did very weird things in the wiki
            # content for some reason, hence both tag spellings
            (_LT + "verbatim" + _GT, r"{code}"),
            (_LT + "/verbatim" + _GT, r"{code}"),
            (_LT + "/?pre" + _GT, r"{noformat}"),
            (_LT + "/?nop" + _GT, r""),
            (_LT + "p" + _GT, r"\n\n"),
            # The replacement template yields two backslashes, which is the
            # Confluence markup for a line break.
            (_LT + "br" + _GT + r"|%BR%", r"\\\\"),
            # !WikiWord escaping in TWiki, isn't needed in Confluence
            (r"([^A-Z])!([A-Z])", r"\1\2"),
            # Replace some of the most-used TWiki icons with Confluence
            # equivalents.  % symbols were also seen stored as HTML/XML
            # escape sequences, so accept both spellings.
            (r"(?:%|&#37;)T(?:%|&#37;)", r"(on)"),
            (r"(?:%|&#37;)Y(?:%|&#37;)", r"(/)"),
            (r"(?:%|&#37;)X(?:%|&#37;)", r"(!)"),
        ]

        transformedContent = self._content
        for pattern, replacement in transformations:
            transformedContent = re.sub(
                pattern, replacement, transformedContent)
        return transformedContent


class CachedURLopener(FancyURLopener):
    """URL opener that answers authentication prompts with the TWiki login."""

    def get_user_passwd(self, host, realm, clear_cache):
        return (twikilogin, twikipass)


class Mapper:
    """Collects the pages reachable from a start page and stores them."""

    def __init__(self):
        self._mappedPages = Set()
        self._orderedPages = []
        self.opener = CachedURLopener()

    # Add pages to a map, and maintains them in order read
    def addPage(self, page):
        """Register *page*; return False if it was already known."""
        # Hmmpf. add() doesn't have the nice property of telling if it added
        if page in self._mappedPages:
            return False
        self._mappedPages.add(page)
        self._orderedPages.append(page)
        return True

    def process(self, spaceName, parent):
        """Read the page *spaceName* and, recursively, all it refers to.

        *parent* is the page that referred to this one (None for the start
        page).  Pages that were already read are skipped.
        """
        p = Page(spaceName)
        p._parent = parent
        if not self.addPage(p):
            return
        try:
            fd = self.opener.open(twikiurlbase + twikibase + "/" +
                                  spaceName[0] + "/" + spaceName[1] +
                                  "?raw=1")
            print("Reading " + str(p._spaceName))
            try:
                content = fd.read()
            finally:
                fd.close()
            if not isinstance(content, str):
                # Python 3 hands back bytes; the regexes need text.
                content = content.decode("utf-8", "replace")
            p.setContent(content)
            # Recurse into pages referred to by current page
            for ref in p._references:
                self.process(ref, p)
        except Exception:
            # TWiki allows lots of complex ways to link, and also lots of
            # complex ways to *not* link (example, putting text inside a
            # =monospace= line). So, if we fail to open a page, we assume
            # it's just a mistake with trying to follow a non-reference.
            # Only Exception is caught so that Ctrl-C still stops the run.
            print("Problem reading " + str(p._spaceName) +
                  " ... probably wasn't a real reference.")

    def writeToConfluence(self):
        """Write the set of twiki pages into confluence over XML-RPC."""
        server = xmlrpclib.Server(confluenceurlbase + "/rpc/xmlrpc")
        token = server.confluence1.login(confluencelogin, confluencepass)
        for p in self._orderedPages:
            space = p._spaceName[0]
            name = p._confluenceRefName
            pageUrl = (confluenceurlbase + "/display/" + space + "/" +
                       name.replace(' ', '+'))
            homepage = None
            try:
                server.confluence1.getSpace(token, space)
            except xmlrpclib.Fault:
                # The lookup failed, so create the space.  Omit the
                # description and homepage.
                spaceobj = {
                    'key': space,
                    'name': space,
                    'url': confluenceurlbase + "/display/" + space,
                }
                newspace = server.confluence1.addSpace(token, spaceobj)
                homepage = newspace['homePage']

            content = p.getTransformedContent()
            if not content:
                continue

            print("Storing " + str(p._spaceName) + " as '" + name + "'")
            page = {
                'creator': 'twikiToConfluence',
                'url': pageUrl,
                'title': name,
                'space': space,
                'content': content,
            }
            # Changed so first page created in a newspace
            # is the home page.  Can't change title
            # so links back to "Web Home" will be broken.
            if homepage:
                page = server.confluence1.getPage(token, homepage)
                print(page)
                page['url'] = pageUrl
                # Changing page['title'] doesn't seem to be allowed.
                page['content'] = content
            if p._parent is not None and p._parent._id != -1:
                page['parentId'] = p._parent._id
            page = server.confluence1.storePage(token, page)
            p._id = page.get("id")

            for image in p._imagelist:
                # Doh.  There is no rpc for attachments, so only tell the
                # user what has to be attached by hand.
                print("Attach the image from " +
                      twikiurlbase + twikibase + "/" +
                      image[0] + "/" + image[1] + "/" + image[2])
                print(" to the Confluence page " +
                      p._confluenceRefName + " as " + image[2])


def main():
    """Load (or build and cache) the TWiki map, then write it to Confluence.

    Save/restore the loaded twiki in a pickle.  This saves the loaded
    twiki to allow tuning all the transformation regexes without reloading.
    """
    cachePath = "twikimapper.pickle"
    if os.path.exists(cachePath):
        # NOTE: unpickling executes whatever the file describes; only use a
        # cache file that this script wrote itself.
        # Binary mode keeps the pickle bytes intact on every platform.
        with open(cachePath, "rb") as dumpfile:
            twikimapper = cPickle.load(dumpfile)
    else:
        twikimapper = Mapper()
        print("connecting to TWiki...")
        twikimapper.process((twikistartspace, twikistartconcept), None)
        with open(cachePath, "wb") as dumpfile:
            cPickle.dump(twikimapper, dumpfile)

    print("connecting to Confluence...")
    twikimapper.writeToConfluence()


if __name__ == "__main__":
    main()