# Protoball:Convert Html From Pdf to WikiText.py
#
# From Protoball
def convert_html(text, output_path="/home/dave/Desktop/wikitext.txt"):
    """Convert the table rows of an HTML document into wikitext table rows.

    The markup is simplified first: every ``<p>`` is unwrapped, every
    ``<span>`` whose ``style`` attribute contains ``italic`` is wrapped in an
    ``<i>`` tag, and then every ``<span>`` is unwrapped.  Each ``<tr>`` is
    written as a ``|-`` line followed by one ``| ...`` line per ``<td>``
    found inside it.  Rows whose first ``<td>`` is missing or contains no
    non-whitespace text are skipped.

    Args:
        text: HTML markup to convert.
        output_path: File that receives the wikitext; it is overwritten.
            Defaults to the location the script originally hard-coded.

    Returns:
        None.  The result is written to ``output_path``.
    """
    import re

    from bs4 import BeautifulSoup
    from bs4 import NavigableString

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(text, "html.parser")

    for paragraph in soup.find_all("p"):
        paragraph.unwrap()

    # Italic spans must be wrapped before the spans themselves are removed,
    # otherwise the styling information would be lost.
    for span in soup.select('span[style*="italic"]'):
        span.wrap(soup.new_tag("i"))
    for span in soup.find_all("span"):
        span.unwrap()

    def clean_html(node):
        """Replace newlines with spaces and collapse runs of spaces in place."""
        # Iterate over a snapshot: replace_with() edits the child list.
        for child in list(node.children):
            if isinstance(child, NavigableString):
                flattened = str(child).replace("\n", " ")
                # A single .replace("  ", " ") pass leaves runs of three or
                # more spaces only partly collapsed; collapse the whole run.
                child.replace_with(re.sub(r" {2,}", " ", flattened))
            else:
                clean_html(child)

    # The context manager closes the file even if a row fails to convert.
    with open(output_path, "w", encoding="utf-8") as out:
        for row in soup.find_all("tr"):
            first_cell = row.td
            # Rows without any <td> (e.g. header rows made of <th>) have no
            # first cell to inspect; skip them rather than crash.
            if first_cell is None or not list(first_cell.stripped_strings):
                continue
            out.write("|-\n")
            for cell in row.find_all("td"):
                clean_html(cell)
                out.write("| " + "".join(str(x) for x in cell.children) + "\n")