Protoball:Convert Ms Html WikiText.py
Jump to navigation
Jump to search
def convert_ms_html(text, output_path="/home/dave/Desktop/wikitext.txt",
                    parser=None):
    """Convert a Microsoft-generated HTML table of games into wikitext tables.

    Every ``<tr>`` found in *text* is classified by the text of its first
    ``<td>``:

    * no ``<td>``, no text, or first string ``"Date of Game"``: skipped;
    * first string is exactly four digits: taken as a year; closes the
      previous wiki table (if any), then writes a ``== year ==`` heading and
      opens a new table with its header row;
    * anything else: written as one table row, one wiki cell per ``<td>``,
      after stripping Office-specific markup.  The current year is inserted
      into the first cell of the row (see ``add_year``).

    Args:
        text: HTML markup to convert.
        output_path: File that receives the wikitext; it is overwritten.
            Defaults to the path the script has always written to.
        parser: Parser name handed to ``BeautifulSoup`` (for example
            ``"html.parser"``).  ``None`` lets Beautiful Soup choose, which
            is what this function did before the parameter existed.

    Raises:
        ValueError: If a data row is encountered before any year row, since
            there is then neither a year to add nor an open table.
    """
    import re

    from bs4 import BeautifulSoup
    from bs4 import NavigableString

    months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    # Presentation attributes dropped from every tag.
    kill_list = ('class', 'style', 'align')
    # Wrapper tags (spans and Office "smart tag" elements) whose content is
    # kept but whose own markup is discarded.
    unwrap_list = frozenset([
        'span', 'o:p',
        'st1:city', 'st1:state', 'st1:place', 'st1:placename',
        'st1:placetype', 'st1:address', 'st1:street', 'st1:citation',
        'st1:country-region',
        'st2:city', 'st2:state', 'st2:place', 'st2:placename',
        'st2:placetype', 'st2:address', 'st2:street', 'st2:country-region',
        'st2:citation',
    ])
    table_header = (
        # NOTE(review): the line breaks of this template were restored from a
        # whitespace-collapsed copy of the script so that every wikitext
        # table directive starts its own line -- confirm against the
        # original output.
        " ==\n"
        "{| class=nice\n"
        "! Date of Game\n"
        "! City/Town Playing Field\n"
        "! Outcome\n"
        "! Sources\n"
    )

    def add_year(line, year):
        """Insert ``", <year>"`` after the earliest month abbreviation.

        Months are matched as case-sensitive substrings; the year goes
        directly after the three matched characters.  A line that contains
        no month abbreviation is returned unchanged.
        """
        positions = [pos for pos in (line.find(month) for month in months)
                     if pos != -1]
        if not positions:
            return line
        cut = min(positions) + 3
        return line[:cut] + ", " + year + line[cut:]

    def clean_html(node):
        """Recursively normalise whitespace and strip Office markup in place."""
        # Iterate over a snapshot: children may unwrap or replace themselves,
        # which would make a live iterator skip the following sibling.
        for child in list(node.children):
            if isinstance(child, NavigableString):
                # NOTE(review): the available copy of this script shows both
                # arguments of the second replace() as a single space (a
                # no-op), apparently because whitespace runs were collapsed;
                # restored here as double-space -> single-space.  Confirm.
                child.replace_with(
                    child.string.replace("\n", ' ').replace("  ", " "))
            else:
                clean_html(child)
        for kill in kill_list:
            # hasattr() on a tag looks up a *child tag* of that name, so test
            # the attribute dictionary instead.
            if kill in node.attrs:
                del node[kill]
        if node.name in unwrap_list:
            node.unwrap()

    def remove_empty_tags(node):
        """Remove direct children of *node* that contain no visible text."""
        # Snapshot for the same reason as in clean_html: decompose() edits
        # the list being walked.
        for child in list(node.contents):
            if child.get_text(strip=True) == "":
                child.decompose()

    def clean_soup(node):
        """Run clean_html, then drop empty direct children.

        Not used by the conversion below.
        """
        clean_html(node)
        # One pass is enough now that remove_empty_tags no longer skips
        # siblings while deleting.
        remove_empty_tags(node)

    year_re = re.compile(r"^\d{4}$")
    soup = BeautifulSoup(text, parser)

    year = None  # Most recent year row seen; None until the first one.
    with open(output_path, "w", encoding="utf-8") as out:
        for tr in soup.find_all('tr'):
            first_cell = tr.td
            if first_cell is None:
                # Row without any <td> (for example a <th>-only row).
                continue
            strings = list(first_cell.stripped_strings)
            if not strings or strings[0] == "Date of Game":
                continue

            if year_re.match(strings[0]):
                if year is not None:
                    out.write("|}\n\n")  # Close the previous year's table.
                year = strings[0]
                out.write("== " + year + table_header)
                continue

            if year is None:
                raise ValueError(
                    "data row found before any four-digit year row: %r"
                    % (strings[0],))

            out.write("|-\n")
            for position, cell in enumerate(tr.find_all('td')):
                clean_html(cell)
                cell_text = "".join(str(part) for part in cell.children)
                cell_text = (cell_text.replace("<p> ", "<p>")
                             .replace(" </p>", "</p>")
                             .replace("<p></p>", ""))
                if position == 0:
                    # Only the first cell of a row gets the year added.
                    cell_text = add_year(cell_text, year)
                out.write("| " + cell_text + "\n")

        if year is not None:
            # Close the last table; nothing to close if none was opened.
            out.write("|}\n\n")