Hyphenation can make justified text look better by minimizing the space between words. This simple program takes XML as input and outputs XML in which every word longer than 5 characters is hyphenated. Hyphenation is done by the OpenOffice hyphenation library. The program inserts the `&shy;` (soft hyphen) entity into words, which works in some browsers. Please be aware that the code is not optimized, but since it is run relatively rarely (on insert or update, not on select), optimizing it is not worthwhile in most implementations.
#!/usr/bin/python
# NOTE: the star import is deliberately kept ahead of the explicit imports so
# that it can never shadow a name imported explicitly below.
from hyphen import hyphenator
from hyphen.dictools import *

import re
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
from xml.sax.saxutils import escape
# Language of the hyphenation dictionary used by the whole script.
lang = "pl_PL"

# Install the dictionary on first use; is_installed/install come from
# hyphen.dictools (star-imported at the top of the file).
if not is_installed(lang):
    install(lang)

# Module-level hyphenator shared by XMLHyphenator. Built from `lang` rather
# than a second hard-coded literal so the installed dictionary and the one
# actually loaded cannot drift apart.
h_pl = hyphenator(lang)
class XMLHyphenator(ContentHandler):
    """SAX handler that re-serializes a document with long words hyphenated.

    Every start tag, end tag and run of character data received from the
    parser is appended, as a string fragment, to ``self.outputXML``; joining
    that list yields the output document.  Words of at least
    ``MIN_WORD_LENGTH`` characters get ``SOFT_HYPHEN`` inserted at every
    break point reported by the hyphenator.

    Only elements, attributes and character data are handled; anything else
    the parser reports (comments, processing instructions, ...) is not
    written to the output.

    Each ``characters()`` call is processed independently, so a word that the
    parser happens to deliver in two chunks is hyphenated as two pieces.
    """

    # Marker inserted at each break point.  The accompanying description says
    # the program adds the soft-hyphen *entity*; a raw U+00AD character in the
    # source would also need an encoding declaration under Python 2.
    # NOTE(review): ``&shy;`` is an HTML entity and is not predefined in plain
    # XML; set this attribute to "&#173;" if strict XML consumers are expected.
    SOFT_HYPHEN = "&shy;"

    # Shortest word that is hyphenated (the original rule is "longer than 5").
    MIN_WORD_LENGTH = 6

    # XML whitespace.  The capturing group makes split() return the separators
    # too, so the original spacing can be written back unchanged.
    _SEPARATOR = re.compile(r"([ \t\r\n]+)")

    # Extra replacement for attribute values, which are always emitted inside
    # double quotes.
    _ATTR_ENTITIES = {'"': "&quot;"}

    def __init__(self, word_hyphenator=None):
        """Create a handler with an empty output buffer.

        word_hyphenator -- optional object with an ``inserted(word)`` method
            returning the word with "=" at every break point.  When omitted,
            the module-level ``h_pl`` hyphenator is used.
        """
        # ContentHandler is an old-style class on Python 2, so call the base
        # initializer explicitly instead of via super().
        ContentHandler.__init__(self)
        # Not used by this class; kept so the instance attributes stay
        # backward-compatible.
        self.isPointsElement, self.isReboundsElement = 0, 0
        self._word_hyphenator = word_hyphenator
        self.outputXML = []

    def startElement(self, name, attrs):
        """Append the start tag for *name*, including its attributes.

        Attribute values are escaped so that "&", "<", ">" and '"' in the
        parsed (already unescaped) value do not break the output markup.
        """
        self.outputXML.append("<" + name)
        for key, value in attrs.items():
            quoted = escape(value, self._ATTR_ENTITIES)
            self.outputXML.append(" " + key + "=\"" + quoted + "\"")
        self.outputXML.append(">")

    def endElement(self, name):
        """Append the end tag for *name*."""
        self.outputXML.append("</" + name + ">")

    def characters(self, ch):
        """Append the text chunk *ch* with its long words hyphenated.

        Whitespace between words is copied through unchanged; no extra
        spaces are added at the end of a chunk.
        """
        pieces = self._SEPARATOR.split(ch)
        for index, piece in enumerate(pieces):
            if index % 2:
                # Odd positions hold the captured whitespace separators.
                self.outputXML.append(piece)
            elif piece:
                self.outputXML.append(self._hyphenate(piece))

    def _hyphenate(self, word):
        """Return *word* XML-escaped, with soft hyphens if it is long enough."""
        # A literal "=" cannot be told apart from the "=" break marker the
        # hyphenator output is assumed to use, so such tokens are left alone.
        if len(word) < self.MIN_WORD_LENGTH or "=" in word:
            return escape(word)

        word_hyphenator = self._word_hyphenator
        if word_hyphenator is None:
            word_hyphenator = h_pl

        # The hyphenator treats commas as part of a word, so set a trailing
        # comma aside (only that one - inner commas stay) and re-attach it.
        suffix = ""
        if word.endswith(","):
            word, suffix = word[:-1], ","

        # Escape each syllable separately so the marker itself is not escaped.
        syllables = word_hyphenator.inserted(word).split("=")
        return self.SOFT_HYPHEN.join(escape(part) for part in syllables) + suffix
# Run the input document through XMLHyphenator and print the result.
parser = make_parser()
x = XMLHyphenator()
parser.setContentHandler(x)

# Open in binary mode so the parser itself honours the encoding declared in
# the document, and close the file deterministically once parsing is done.
with open('haze.html', 'rb') as source:
    parser.parse(source)

# Single-argument call form: behaves the same on Python 2 and Python 3.
print("".join(x.outputXML))
