Protoball:Convert Html From Pdf to WikiText.py
Jump to navigation
Jump to search
def convert_html(text, output_path="/home/dave/Desktop/wikitext.txt"):
    """Convert HTML table rows (as exported from a PDF) into wikitext rows.

    The markup is simplified first: ``<p>`` wrappers are removed, ``<span>``
    elements whose inline style mentions ``italic`` become ``<i>`` elements,
    and every remaining ``<span>`` is removed while keeping its contents.
    Each ``<tr>`` whose first ``<td>`` contains visible text is then written
    as a ``|-`` row separator followed by one ``| cell`` line per ``<td>``.

    Args:
        text: HTML markup to convert.
        output_path: File the wikitext rows are written to. Defaults to the
            location the script has always used, so existing callers are
            unaffected.

    Returns:
        None. The result is written to ``output_path``.
    """
    import re

    from bs4 import BeautifulSoup
    from bs4 import NavigableString

    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so output could differ between machines.
    soup = BeautifulSoup(text, "html.parser")

    for paragraph in soup.find_all("p"):
        paragraph.unwrap()

    # Wrap italic spans in <i> before the spans themselves are stripped, so
    # the emphasis survives the removal of all <span> elements below.
    for span in soup.select('span[style*="italic"]'):
        span.wrap(soup.new_tag("i"))

    for span in soup.find_all("span"):
        span.unwrap()

    # Newlines, non-breaking spaces and runs of spaces all collapse to one
    # plain space; a raw newline inside a cell would break the one-line-per-
    # cell output format.
    whitespace_run = re.compile(r"\s+")

    def clean_html(node):
        """Collapse whitespace in every text node underneath ``node``."""
        # Iterate over a snapshot: replace_with() mutates the parent's child
        # list, which must not happen while that list is being iterated.
        for child in list(node.children):
            if isinstance(child, NavigableString):
                cleaned = whitespace_run.sub(" ", str(child))
                if cleaned != str(child):
                    child.replace_with(cleaned)
            else:
                clean_html(child)

    # The context manager guarantees the file is closed even if a row fails.
    with open(output_path, "w", encoding="utf-8") as out:
        for row in soup.find_all("tr"):
            first_cell = row.td
            # Skip rows with no <td> at all (e.g. header-only rows) and rows
            # whose first cell holds no visible text.
            if first_cell is None or not any(first_cell.stripped_strings):
                continue
            out.write("|-\n")
            for cell in row.find_all("td"):
                clean_html(cell)
                out.write("| " + "".join(str(x) for x in cell.children) + "\n")