Protoball:Convert Ms Html WikiText.py

From Protoball
Jump to navigation Jump to search
def convert_ms_html(text, output_path="/home/dave/Desktop/wikitext.txt"):
    """Convert an HTML table of games into MediaWiki tables and write them to a file.

    Every ``<tr>`` in *text* is examined through its first ``<td>``:

    * a row whose first string is a four-digit year starts a new
      ``== year ==`` section with a fresh table (closing the previous one);
    * a row whose first string is ``"Date of Game"`` (a repeated header) or
      whose first cell has no text is skipped;
    * any other row becomes one table row, with the current year inserted
      after the month name found in its first cell.

    Args:
        text: HTML markup (as exported by Microsoft Word) containing the rows.
        output_path: File that receives the generated wikitext. Defaults to
            the location the original script always wrote to.

    Raises:
        ValueError: If a data row appears before any year row, so there is
            no year to attach to its date.

    The output file is only written after the whole input was converted, so
    a failure part-way through does not leave a truncated file behind.
    """
    import re

    from bs4 import BeautifulSoup
    from bs4 import NavigableString

    # Three-letter month abbreviations, each optionally followed by the rest
    # of the full month name, so that "Jul" and "July" are both matched whole.
    month_re = re.compile(
        "Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?"
        "|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember|t)?|Oct(?:ober)?|Nov(?:ember)?"
        "|Dec(?:ember)?"
    )
    year_re = re.compile(r"^\d{4}$")
    space_run_re = re.compile(r" {2,}")

    # Presentation-only attributes that are dropped from every tag.
    kill_list = ('class', 'style', 'align')
    # Wrapper tags that are replaced by their own contents.
    unwrap_list = frozenset([
        'span', 'o:p',
        'st1:city', 'st1:state', 'st1:place', 'st1:placename',
        'st1:placetype', 'st1:address', 'st1:street', 'st1:citation',
        'st1:country-region',
        'st2:city', 'st2:state', 'st2:place', 'st2:placename',
        'st2:placetype', 'st2:address', 'st2:street', 'st2:country-region',
        'st2:citation',
    ])

    def add_year(line, year):
        """Return *line* with ", <year>" inserted right after the first month name.

        When no month name is present the year is appended to the end.
        """
        match = month_re.search(line)
        if match is None:
            return line + ", " + year
        cut = match.end()
        return line[:cut] + ", " + year + line[cut:]

    def clean_html(soup):
        """Normalise whitespace, drop styling attributes and unwrap wrapper tags in place."""
        # Iterate over a snapshot: the recursive call may unwrap a child,
        # which edits the parent's child list and would otherwise make the
        # live iterator skip the following sibling.
        for child in list(soup.children):
            if isinstance(child, NavigableString):
                flattened = child.replace("\n", " ")
                # Collapse every run of spaces, not just the first pair.
                child.replace_with(space_run_re.sub(" ", flattened))
            else:
                clean_html(child)
        for kill in kill_list:
            # Remove the attribute if present; absent attributes are fine.
            soup.attrs.pop(kill, None)
        if soup.name in unwrap_list:
            soup.unwrap()

    def remove_empty_tags(soup):
        """Remove direct children of *soup* that contain no visible text."""
        # Snapshot the children because removing one edits soup.contents.
        for child in list(soup.contents):
            if isinstance(child, NavigableString):
                if child.strip() == "":
                    child.extract()
            elif child.get_text(strip=True) == "":
                child.decompose()

    def clean_soup(soup):
        """Run clean_html and then strip empty children.

        Not called by the conversion below; kept as an optional stricter
        clean-up step.
        """
        clean_html(soup)
        remove_empty_tags(soup)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(text, "html.parser")

    pieces = []
    year = None  # year of the table currently open, None before the first one
    for tr in soup.find_all('tr'):
        first_cell = tr.find('td')
        if first_cell is None:
            # Row without any <td> (for example a <th>-only row).
            continue
        strings = list(first_cell.stripped_strings)
        if not strings or strings[0] == "Date of Game":
            continue
        if year_re.match(strings[0]):
            if year is not None:
                # Close the previous year's table before starting a new one.
                pieces.append("|}\n\n")
            year = strings[0]
            pieces.append(
                "== " + year + " ==\n"
                "\n"
                "{| class=nice\n"
                "! Date of Game  \n"
                "! City/Town   Playing Field  \n"
                "! Outcome  \n"
                "! Sources\n"
            )
            continue
        if year is None:
            raise ValueError(
                "data row found before any year row: %r" % (strings[0],)
            )
        pieces.append("|-\n")
        for position, cell in enumerate(tr.find_all('td')):
            clean_html(cell)
            cell_text = "".join(str(x) for x in cell.children)
            cell_text = (
                cell_text.replace("<p> ", "<p>")
                .replace(" </p>", "</p>")
                .replace("<p></p>", "")
            )
            if position == 0:
                # Only the first cell of a row holds the date of the game.
                cell_text = add_year(cell_text, year)
            pieces.append("| " + cell_text + "\n")
    if year is not None:
        # Close the last table; nothing to close if no table was opened.
        pieces.append("|}\n\n")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(pieces))