#! /usr/bin/python
# -*- coding: UTF-8 -*-
# ktouch_tfa_wikipedia-2.1.py by bnr 20091105
# ktouch touch typing lessons scraped from
# today's featured article from wikipedia 

import re
import os
import codecs
from xml.dom import minidom
from urllib import FancyURLopener

def main():
	"""Scrape the featured-article excerpt and store it as a KTouch lecture.

	The excerpt lines are collected by getTFA() from the Wikipedia main page
	and then handed to makeXmlFile(), which writes them to the KTouch
	lecture file.
	"""
	lectureLines = []
	getTFA("http://en.wikipedia.org/wiki/Main_Page", lectureLines)
	makeXmlFile('/usr/share/kde4/apps/ktouch/wikipedia.ktouch.xml', lectureLines)

# get today's featured article
def _splitIntoLines(text, width) :
	"""Break text into pieces of about `width` characters, cutting only at spaces.

	Every piece except the last spans at least `width` characters of the
	input and is extended up to the next space so that no word is split.
	Pieces are stripped of surrounding whitespace and empty pieces are
	dropped.  Returns the list of pieces.
	"""
	lines = []
	start = 0
	end = len(text)
	while start < end :
		stop = start + width
		if stop < end :
			# extend the cut to the next space; if there is none, take the rest
			space = text.find(' ', stop)
			stop = space if space != -1 else end
		else :
			stop = end
		line = text[start:stop].strip()
		if line :
			lines.append(line)
		start = stop
	return lines


def getTFA(url, lessons) :
	"""Download url and append the featured-article excerpt to lessons.

	The excerpt is the text between the "mp-tfa" <div> and the
	'Recently featured:' marker of the page, with newlines and HTML tags
	removed.  A trailing '(more...)' is dropped when present.  The excerpt
	is appended to `lessons` (a list, modified in place) as lines of
	roughly 70 characters; nothing is returned.

	Raises IndexError if the "mp-tfa" <div> is not found in the page.
	"""

	# a custom urlopener that presents a browser user agent
	class MyOpener(FancyURLopener) :
		version = 'Mozilla/5.0 (Windows NT 5.1; U; en-US; rv:1.8.1) Gecko/20091102 Firefox/3.5.5'

	# get the wikipedia mainpage, closing the connection even if read() fails
	response = MyOpener().open(url)
	try :
		page = response.read()
	finally :
		response.close()

	# parse out Todays Featured Article
	parts = page.split('<div id="mp-tfa" style="padding:2px 5px">')
	if len(parts) < 2 :
		# same exception type the bare [1] index used to raise, but with a message
		raise IndexError('featured article <div id="mp-tfa"> not found in ' + url)
	tfa = parts[1].split('Recently featured:')[0]
	tfa = unicode(tfa, 'utf-8')
	tfa = tfa.replace('\n', '')			# remove newline characters and
	tfa = tfa.replace('&#160;', ' ')	# the non-breaking space entity
	tfa = re.sub(r'<.*?>', '', tfa)		# strip out the html tags
	tfa = tfa.replace(u"\u2013", '-')	# replace long dash

	# drop '(more...)' from the end of the excerpt, but only if it is really
	# there, so that a changed page layout does not chop off real text
	trailer = '(more...)'
	tfa = tfa.rstrip()
	if tfa.endswith(trailer) :
		tfa = tfa[:-len(trailer)]

	# break string into an array of strings ~70 characters long
	lessons.extend(_splitIntoLines(tfa, 70))

# if editing existing xml remove whitespace (or else it grows)
def removeWhitespaceNodes(node):
	"""Recursively tidy the text nodes below node, in place.

	Every text node has its surrounding whitespace stripped and its
	newline characters removed; text nodes that end up empty are detached
	from the tree and unlinked.  Returns None.
	"""
	blanks = []
	for kid in node.childNodes:
		if kid.nodeType != minidom.Node.TEXT_NODE:
			# only nodes that have children of their own need a deeper pass
			if kid.hasChildNodes():
				removeWhitespaceNodes(kid)
			continue
		kid.data = kid.data.strip().replace('\n', '')
		if not kid.data:
			blanks.append(kid)
	# detach after the walk so childNodes is not modified while iterating
	for blank in blanks:
		node.removeChild(blank)
		blank.unlink()
		
# create or read and rewrite xml file
def makeXmlFile(xmlFile, lessons) :
	# if xml file exists, read it
	if os.path.exists(xmlFile) :
		xmlObj = open(xmlFile, mode='rb') 
		doc = minidom.parse(xmlObj)
		xmlObj.close()
		removeWhitespaceNodes(doc)
	
		# the parent of the new level 
		levelsTag = doc.getElementsByTagName("Levels")[0]	
		lastLesson = levelsTag.childNodes[-1].childNodes[1].firstChild.toxml()

	else:	# if xml file does not exist create it
		lastLesson = ''
		doc = minidom.Document()
	
		# base Node
		ktl = doc.createElement("KTouchLecture")
		doc.appendChild(ktl)
	
		# first child of base Node
		titleTag = doc.createElement("Title")
		ktl.appendChild(titleTag)
		titleText = doc.createTextNode("Wikipedia Daily Featured Article (auto-generated)")
		titleTag.appendChild(titleText)
	
		# second child of base Node
		commentTag = doc.createElement("Comment")
		ktl.appendChild(commentTag)
		commentText = doc.createTextNode("KTouch training file generated by python")
		commentTag.appendChild(commentText)
	
		# third child of base Node
		fontTag = doc.createElement("FontSuggestions")
		ktl.appendChild(fontTag)
		fontText = doc.createTextNode("Monospace")
		fontTag.appendChild(fontText)
	
		# fourth child of base Node; parent of all Level Nodes
		levelsTag = doc.createElement("Levels")
		ktl.appendChild(levelsTag)
		
	# prevent the creation of duplicate lessons	
	if lastLesson != lessons[0] :
		# child of fourth child
		levelTag = doc.createElement("Level")
		levelsTag.appendChild(levelTag)
	
		# grandchild of fourth child; 
		newChars = doc.createElement("NewCharacters")
		levelTag.appendChild(newChars)
		newCharsText = doc.createTextNode(lessons[0][:36])
		newChars.appendChild(newCharsText)
	
		# more grandchildren of fourth child; our 'lesson lines'
		for line in lessons :
			lineTag = doc.createElement("Line")
			levelTag.appendChild(lineTag)
			lineText = doc.createTextNode(line)
			lineTag.appendChild(lineText)

	# output pretty XML and print to file
	output = doc.toprettyxml(indent="  ")
	outFile = codecs.open(xmlFile, mode='w', encoding='utf-8')
	outFile.write(output)
	outFile.close()

# run only when executed as a script, so that importing this file does not
# trigger the download and the rewrite of the lecture file
if __name__ == '__main__' :
	main()
