Protoball:Convert Ms Html WikiText.py: Difference between revisions
Jump to navigation
Jump to search
No edit summary |
No edit summary |
||
Line 4: | Line 4: | ||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||
from bs4 import NavigableString | from bs4 import NavigableString | ||
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] | |||
def add_year(line, year): | |||
index = 100 | |||
for month in months: | |||
if month in line: | |||
index = min(index, line.find(month)) | |||
line = line[0:index+3] + ", " + year + line[index+3:] | |||
return line | |||
def clean_html(soup): | def clean_html(soup): | ||
Line 41: | Line 51: | ||
continue | continue | ||
if year.match(strings[0]): | if year.match(strings[0]): | ||
year = strings[0] | |||
if first_year: | if first_year: | ||
first_year = False | first_year = False | ||
Line 55: | Line 66: | ||
continue | continue | ||
f.write("|-\n") | f.write("|-\n") | ||
is_year_field = True | |||
for cell in tr.find_all('td'): | for cell in tr.find_all('td'): | ||
clean_html(cell) | clean_html(cell) | ||
text = "".join([str(x) for x in cell.children]) | |||
if is_year_field: | |||
is_year_field = False | |||
text = add_year(text, year) | |||
f.write("| " + text + "\n") | |||
f.write("""|}\n\n""") | f.write("""|}\n\n""") | ||
f.close() | f.close() | ||
</pre> | </pre> |
Revision as of 07:34, 10 September 2012
def convert_ms_html(text, output_path="/home/dave/Desktop/wikitext.txt"):
    """Convert an HTML table of games into MediaWiki table markup on disk.

    Every ``<tr>`` found in *text* is examined in document order:

    * rows without a ``<td>``, rows whose first cell has no text, and rows
      whose first string is ``"Date of Game"`` are skipped;
    * a row whose first string is exactly four digits starts a new
      ``== YEAR ==`` section with a fresh ``{| class=nice`` table (the
      previously opened table, if any, is closed first);
    * any other row is emitted as a table row, one ``| cell`` line per
      ``<td>``, with the current year inserted into the first cell.

    Args:
        text: HTML markup to convert (handed to ``BeautifulSoup`` as-is).
        output_path: File the wikitext is written to; it is overwritten.
            Defaults to the path this script originally hard-coded.

    Returns:
        None. The result is only written to *output_path*.
    """
    import re

    from bs4 import BeautifulSoup
    from bs4 import NavigableString

    months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

    # Attributes stripped from every tag.
    kill_list = ('class', 'style')

    # Wrapper tags that are removed while their contents are kept.
    unwrap_list = frozenset([
        'span', 'o:p',
        'st1:city', 'st1:state', 'st1:place', 'st1:placename',
        'st1:placetype', 'st1:address', 'st1:street', 'st1:citation',
        'st1:country-region',
        'st2:city', 'st2:state', 'st2:place', 'st2:placename',
        'st2:placetype', 'st2:address', 'st2:street', 'st2:country-region',
        'st2:citation',
    ])

    def add_year(line, year):
        """Return *line* with ``", " + year`` placed after its first month.

        The year goes right after the three-letter abbreviation that occurs
        earliest in *line*. When no abbreviation occurs, it is appended to
        the end of the line.
        """
        # NOTE(review): this is a plain substring search, so a spelled-out
        # month ("March") is split after its first three letters; presumably
        # the input only uses abbreviations -- confirm against the data.
        hits = [pos for pos in (line.find(month) for month in months)
                if pos != -1]
        if not hits:
            return line + ", " + year
        cut = min(hits) + 3
        return line[:cut] + ", " + year + line[cut:]

    def clean_html(soup):
        """Recursively normalise text nodes and strip markup noise in place."""
        # Iterate over a snapshot: replace_with() and unwrap() below edit the
        # parent's child list, and iterating the live list skips the sibling
        # that follows an unwrapped element with no contents.
        for child in list(soup.children):
            if isinstance(child, NavigableString):
                # NOTE(review): as transcribed, the second replace() swaps a
                # space for a space. The original argument may have been a
                # non-breaking space or a double space that was lost when the
                # page was rendered -- confirm against the wiki source.
                child.replace_with(
                    child.string.replace("\n", ' ').replace(" ", " "))
            else:
                clean_html(child)
        for kill in kill_list:
            if kill in soup.attrs:
                del soup[kill]
        # Unwrap only after the children are processed, so the contents that
        # get hoisted into the parent have already been cleaned.
        if soup.name in unwrap_list:
            soup.unwrap()

    def remove_empty_tags(soup):
        """Delete direct children of *soup* whose stripped text is empty."""
        # Snapshot for the same reason as in clean_html(): decompose()
        # removes the child from the list being walked.
        for child in list(soup.contents):
            if child.get_text(strip=True) == "":
                child.decompose()

    def clean_soup(soup):
        """Run clean_html() and then two passes of remove_empty_tags()."""
        # Not called by the conversion below; kept as a helper.
        clean_html(soup)
        remove_empty_tags(soup)
        remove_empty_tags(soup)

    year_pattern = re.compile(r"^\d{4}$")
    soup = BeautifulSoup(text)

    # Kept separate from year_pattern: storing the matched year under the
    # same name as the compiled regex breaks the next row's match() call.
    current_year = None
    table_open = False

    with open(output_path, "w") as f:
        for tr in soup.find_all('tr'):
            first_cell = tr.td
            if first_cell is None:
                # Row without any <td>; nothing to emit.
                continue
            strings = list(first_cell.stripped_strings)
            if not strings or strings[0] == "Date of Game":
                continue

            if year_pattern.match(strings[0]):
                current_year = strings[0]
                if table_open:
                    f.write("|}\n\n")
                table_open = True
                f.write(
                    "== " + current_year + " ==\n"
                    "{| class=nice\n"
                    "! Date of Game\n"
                    "! City/Town Playing Field\n"
                    "! Outcome\n"
                    "! Sources\n"
                )
                continue

            f.write("|-\n")
            for position, cell in enumerate(tr.find_all('td')):
                clean_html(cell)
                cell_text = "".join([str(x) for x in cell.children])
                # Only the first cell of a row gets the year, and only once
                # a year heading has been seen.
                if position == 0 and current_year is not None:
                    cell_text = add_year(cell_text, current_year)
                f.write("| " + cell_text + "\n")

        if table_open:
            f.write("|}\n\n")