Protoball:Convert Ms Html WikiText.py
Jump to navigation
Jump to search
def convert_ms_html(
    text: str,
    output_path: str = "/home/dave/Desktop/wikitext.txt",
) -> None:
    """Convert a Microsoft-exported HTML table into MediaWiki table markup.

    Every ``<tr>`` in *text* is inspected through the stripped strings of
    its first ``<td>``:

    * rows with no ``<td>``, with an empty first cell, or whose first string
      is ``"Date of Game"`` (the repeated column-header row) are skipped;
    * a row whose first string is a four-digit year closes the previous
      table (if any) and starts a new ``== year ==`` section with a fresh
      table header;
    * any other row is written as a table row, one ``| cell`` line per
      ``<td>``, after the cell's markup has been cleaned.

    A closing ``|}`` is always written at the end of the file.

    Args:
        text: HTML source to convert.
        output_path: File the wikitext is written to (overwritten).

    Returns:
        None. The result is written to *output_path*.

    Requires the third-party ``bs4`` (Beautiful Soup 4) package.
    """
    import re

    from bs4 import BeautifulSoup
    from bs4 import NavigableString

    # Presentational attributes stripped from every tag.
    kill_list = ('class', 'style')
    # Wrapper tags (Word "smart tags" and spans) replaced by their contents.
    unwrap_list = frozenset([
        'span', 'o:p',
        'st1:city', 'st1:state', 'st1:place', 'st1:placename',
        'st1:placetype', 'st1:address', 'st1:street', 'st1:citation',
        'st1:country-region',
        'st2:city', 'st2:state', 'st2:place', 'st2:placename',
        'st2:placetype', 'st2:address', 'st2:street',
        'st2:country-region', 'st2:citation',
    ])
    year = re.compile(r"^\d{4}$")

    def clean_html(soup):
        """Recursively clean *soup* in place.

        Newlines inside text nodes become spaces, ``class``/``style``
        attributes are removed, and tags named in ``unwrap_list`` are
        replaced by their own children.
        """
        # Iterate over a snapshot: replace_with() and unwrap() below edit
        # soup.contents, and walking the live list would skip the sibling
        # that follows an empty unwrapped tag.
        for child in list(soup.children):
            if isinstance(child, NavigableString):
                # NOTE(review): the second replace() is a no-op as written
                # (space -> space); presumably it once targeted a
                # non-breaking or doubled space -- confirm.
                # NOTE(review): Comment nodes are NavigableString subclasses
                # too, so they are rewritten as plain text here -- confirm
                # that is intended.
                child.replace_with(
                    child.string.replace("\n", ' ').replace(" ", " ")
                )
            else:
                clean_html(child)

        # Tag attribute access (hasattr/getattr) looks up child tags, not
        # HTML attributes, so go through the attrs mapping directly.
        for kill in kill_list:
            soup.attrs.pop(kill, None)

        if soup.name in unwrap_list:
            soup.unwrap()

    # Pin the parser so results do not depend on which optional parsers
    # happen to be installed (and to avoid bs4's "no parser specified"
    # warning).
    soup = BeautifulSoup(text, "html.parser")

    first_year = True
    with open(output_path, "w", encoding="utf-8") as f:
        for tr in soup.find_all('tr'):
            first_cell = tr.td
            if first_cell is None:
                # Row without any <td> (e.g. <th>-only): nothing to emit.
                continue

            strings = list(first_cell.stripped_strings)
            if not strings or strings[0] == "Date of Game":
                continue

            if year.match(strings[0]):
                if first_year:
                    first_year = False
                else:
                    # Close the previous year's table.
                    f.write("|}\n\n")
                # Wikitext needs "{|" and each "!" at the start of a line.
                f.write(
                    "== " + strings[0] + " ==\n"
                    "{| class=nice\n"
                    "! Date of Game\n"
                    "! City/Town Playing Field\n"
                    "! Outcome\n"
                    "! Sources\n"
                )
                continue

            f.write("|-\n")
            for cell in tr.find_all('td'):
                clean_html(cell)
                f.write("| " + "".join(str(x) for x in cell.children) + "\n")

        f.write("|}\n\n")