Protoball:Convert Html From Pdf to WikiText.py: Difference between revisions

From Protoball
Jump to navigation Jump to search
(Created page with "Category:Python Scripts <pre> def convert_html(): from bs4 import BeautifulSoup from bs4 import NavigableString f = open("/home/dave/Desktop/GT_NYC.html") ...")
 
No edit summary
 
Line 1: Line 1:
[[Category:Python Scripts]]
[[Category:Python Scripts]]
<pre>
<pre>
def convert_html():
def convert_html(text):
     from bs4 import BeautifulSoup
     from bs4 import BeautifulSoup
     from bs4 import NavigableString
     from bs4 import NavigableString
    f = open("/home/dave/Desktop/GT_NYC.html")
     soup = BeautifulSoup(text)
    t = f.read()
     soup = BeautifulSoup(t)
    f.close()
     ps = soup.find_all('p')
     ps = soup.find_all('p')
     for p in ps:
     for p in ps:
Line 26: Line 23:


     trs = soup.find_all('tr')
     trs = soup.find_all('tr')
     f = open("/home/dave/Desktop/nyc.txt", "w")
     f = open("/home/dave/Desktop/wikitext.txt", "w")
     for tr in trs:
     for tr in trs:
    strings = [x for x in tr.td.stripped_strings]
    strings = [x for x in tr.td.stripped_strings]

Latest revision as of 09:02, 7 July 2012

def convert_html(text, output_path="/home/dave/Desktop/wikitext.txt"):
    """Convert the table rows of an HTML document into wikitext table rows.

    The markup is simplified first: every ``<p>`` is unwrapped, every
    ``<span>`` whose ``style`` attribute contains ``italic`` is wrapped in
    ``<i>``, and then every ``<span>`` is unwrapped.  Each ``<tr>`` whose
    first ``<td>`` contains some non-whitespace text is then written as a
    ``|-`` line followed by one ``| ...`` line per ``<td>`` found in the row.

    Args:
        text: HTML markup (anything ``BeautifulSoup`` accepts as input).
        output_path: File to write the wikitext rows to.  It is opened in
            ``"w"`` mode, so an existing file is overwritten.  Defaults to
            the path the script originally hard-coded.

    Returns:
        None.  The result is written to ``output_path``.
    """
    from bs4 import BeautifulSoup
    from bs4 import NavigableString

    # No parser is named here, so bs4 chooses one itself; kept as in the
    # original so the parse tree (and therefore the output) does not change.
    soup = BeautifulSoup(text)

    # Drop paragraph wrappers but keep their contents.
    for paragraph in soup.find_all('p'):
        paragraph.unwrap()

    # Italic spans become <span><i-wrapped>; the span itself is removed in
    # the next loop, leaving only the <i> element around the text.
    for span in soup.select('span[style*="italic"]'):
        span.wrap(soup.new_tag("i"))
    for span in soup.find_all('span'):
        span.unwrap()

    def clean_html(node):
        """Recursively replace newlines and double spaces in text nodes."""
        # Iterate over a snapshot: replace_with() edits the parent's child
        # list, which must not be the sequence being iterated.
        for child in list(node.children):
            if isinstance(child, NavigableString):
                # Single pass only: a newline becomes a space and each
                # two-space pair becomes one space, so longer runs of
                # spaces are shortened rather than fully collapsed.
                child.replace_with(
                    child.replace("\n", ' ').replace("  ", " ")
                )
            else:
                clean_html(child)

    # The context manager closes the file even if a write or a bs4 call
    # raises part-way through.
    with open(output_path, "w") as out:
        for tr in soup.find_all('tr'):
            first_cell = tr.td
            # Rows without any <td> (for example header rows made only of
            # <th>) have nothing to emit; skip them instead of failing on
            # ``None.stripped_strings``.
            if first_cell is None:
                continue
            # Skip rows whose first cell holds no non-whitespace text.
            if not any(True for _ in first_cell.stripped_strings):
                continue
            out.write("|-\n")
            for cell in tr.find_all('td'):
                clean_html(cell)
                out.write(
                    "| " + "".join(str(x) for x in cell.children) + "\n"
                )