Protoball:Convert Ms Html WikiText.py: Difference between revisions

From Protoball
Jump to navigation Jump to search
No edit summary
No edit summary
Line 15: Line 15:
    if hasattr(soup, kill):
    if hasattr(soup, kill):
        del soup[kill]
        del soup[kill]
unwrap_list = ['p', 'span', 'o:p', 'st1:city', 'st1:state', 'st1:place', 'st1:placename', 'st1:placetype', 'st1:address', 'st1:street', 'st1:citation', 'st1:country-region', 'st2:city', 'st2:state', 'st2:place', 'st2:placename', 'st2:placetype', 'st2:address', 'st2:street', 'st2:country-region', 'st2:citation']
unwrap_list = ['span', 'o:p', 'st1:city', 'st1:state', 'st1:place', 'st1:placename', 'st1:placetype', 'st1:address', 'st1:street', 'st1:citation', 'st1:country-region', 'st2:city', 'st2:state', 'st2:place', 'st2:placename', 'st2:placetype', 'st2:address', 'st2:street', 'st2:country-region', 'st2:citation']
if soup.name in unwrap_list:
if soup.name in unwrap_list:
    soup.unwrap()
    soup.unwrap()

Revision as of 07:13, 10 September 2012

def convert_ms_html(text, output_path="/home/dave/Desktop/wikitext.txt"):
    """Convert an HTML table of games into MediaWiki table markup.

    Every ``<tr>`` in *text* is inspected via its first ``<td>``:

    * rows with no ``<td>``, with an empty first cell, or whose first cell
      starts with ``"Date of Game"`` are skipped;
    * a row whose first cell text is exactly four digits starts a new
      ``== YEAR ==`` section with a fresh wiki table (closing the previous
      table, if any);
    * every other row becomes one wiki table row, one ``| ...`` line per
      ``<td>``, after the cell's markup has been cleaned.

    Args:
        text: HTML markup to parse.
        output_path: File the wikitext is written to. It is overwritten.
            Defaults to the location the script originally hard-coded.

    Returns:
        None. The result is written to *output_path* as UTF-8.
    """
    import re

    from bs4 import BeautifulSoup
    from bs4 import NavigableString

    # Presentation attributes stripped from every tag.
    kill_list = ('class', 'style')
    # Wrapper tags that are removed while their contents are kept.
    unwrap_names = frozenset([
        'span', 'o:p',
        'st1:city', 'st1:state', 'st1:place', 'st1:placename',
        'st1:placetype', 'st1:address', 'st1:street', 'st1:citation',
        'st1:country-region',
        'st2:city', 'st2:state', 'st2:place', 'st2:placename',
        'st2:placetype', 'st2:address', 'st2:street',
        'st2:country-region', 'st2:citation',
    ])

    def clean_html(soup):
        """Recursively normalise whitespace, drop styling, unwrap wrappers."""
        # Iterate over a snapshot: replacing strings and unwrapping children
        # both mutate soup.contents, which would otherwise disturb iteration.
        for child in list(soup.contents):
            if isinstance(child, NavigableString):
                child.replace_with(
                    child.string.replace("\n", ' ').replace("  ", " "))
            else:
                clean_html(child)
        for kill in kill_list:
            # has_attr tests the tag's attributes; hasattr() on a Tag would
            # instead look for a descendant tag with that name.
            if soup.has_attr(kill):
                del soup[kill]
        if soup.name in unwrap_names:
            soup.unwrap()

    def remove_empty_tags(soup):
        """Remove direct children of *soup* that contain no visible text."""
        # Snapshot again: decompose() removes entries from soup.contents.
        for child in list(soup.contents):
            if child.get_text(strip=True) == "":
                child.decompose()

    def clean_soup(soup):
        """Clean *soup* and then drop its empty direct children."""
        clean_html(soup)
        # One pass is enough now that removal no longer skips siblings.
        remove_empty_tags(soup)

    year = re.compile(r"^\d{4}$")

    # NOTE(review): no parser is named, so bs4 picks one from what is
    # installed; pin one here if output must be reproducible across machines.
    soup = BeautifulSoup(text)
    first_year = True
    with open(output_path, "w", encoding="utf-8") as f:
        for tr in soup.find_all('tr'):
            first_cell = tr.td
            if first_cell is None:
                # Row without any <td> (e.g. only <th> cells): nothing to emit.
                continue
            strings = list(first_cell.stripped_strings)
            if not strings or strings[0] == "Date of Game":
                continue
            if year.match(strings[0]):
                if first_year:
                    first_year = False
                else:
                    # Close the table belonging to the previous year.
                    f.write("|}\n\n")
                f.write(
                    "== " + strings[0] + " ==\n"
                    "\n"
                    "{| class=nice\n"
                    "! Date of Game  \n"
                    "! City/Town   Playing Field  \n"
                    "! Outcome  \n"
                    "! Sources\n"
                )
                continue
            f.write("|-\n")
            for cell in tr.find_all('td'):
                clean_html(cell)
                f.write("| " + "".join(str(x) for x in cell.children) + "\n")
        f.write("|}\n\n")