Protoball:Convert Html From Pdf to WikiText.py: Difference between revisions
Jump to navigation
Jump to search
(Created page with "Category:Python Scripts <pre> def convert_html(): from bs4 import BeautifulSoup from bs4 import NavigableString f = open("/home/dave/Desktop/GT_NYC.html") ...") |
No edit summary |
||
Line 1: | Line 1: | ||
[[Category:Python Scripts]] | [[Category:Python Scripts]] | ||
<pre> | <pre> | ||
def convert_html(): | def convert_html(text): | ||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||
from bs4 import NavigableString | from bs4 import NavigableString | ||
soup = BeautifulSoup(text) | |||
soup = BeautifulSoup( | |||
ps = soup.find_all('p') | ps = soup.find_all('p') | ||
for p in ps: | for p in ps: | ||
Line 26: | Line 23: | ||
trs = soup.find_all('tr') | trs = soup.find_all('tr') | ||
f = open("/home/dave/Desktop/ | f = open("/home/dave/Desktop/wikitext.txt", "w") | ||
for tr in trs: | for tr in trs: | ||
strings = [x for x in tr.td.stripped_strings] | strings = [x for x in tr.td.stripped_strings] |
Latest revision as of 09:02, 7 July 2012
def convert_html(text, output_path="/home/dave/Desktop/wikitext.txt"):
    """Write the table rows of an HTML document as wikitext table rows.

    The markup is simplified first: every ``<p>`` is unwrapped, every
    ``<span>`` whose ``style`` attribute contains ``italic`` is wrapped in an
    ``<i>`` tag, and then every ``<span>`` is unwrapped.  Each ``<tr>`` is
    then emitted as a ``|-`` line followed by one ``| ...`` line per ``<td>``
    it contains.  Rows whose first ``<td>`` holds no non-whitespace text (or
    that have no ``<td>`` at all) are skipped.

    Only row and cell lines are written; the surrounding table open/close
    markup is not produced here.

    Args:
        text: HTML markup (anything ``BeautifulSoup`` accepts as input).
        output_path: File to write the result to.  It is created or
            truncated.  Defaults to the path the script originally
            hard-coded.

    Returns:
        None.  The result is written to ``output_path``.
    """
    from bs4 import BeautifulSoup
    from bs4 import NavigableString

    soup = BeautifulSoup(text)

    # Drop paragraph wrappers but keep their contents in place.
    for p in soup.find_all('p'):
        p.unwrap()

    # Preserve italics as <i> before the styling spans are discarded below.
    for span in soup.select('span[style*="italic"]'):
        span.wrap(soup.new_tag("i"))

    for span in soup.find_all('span'):
        span.unwrap()

    def clean_html(node):
        """Recursively replace newlines with spaces in text under *node*."""
        # Iterate over a snapshot: replace_with() edits the parent's child
        # list, which is unsafe to do while walking the live iterator.
        for child in list(node.children):
            if isinstance(child, NavigableString):
                flattened = child.string.replace("\n", ' ')
                # NOTE(review): as written this swaps a single space for a
                # single space, i.e. a no-op.  Possibly the original collapsed
                # double spaces and the literal was mangled - confirm.
                flattened = flattened.replace(" ", " ")
                child.replace_with(flattened)
            else:
                clean_html(child)

    # The context manager closes the file even if a row fails mid-write.
    with open(output_path, "w", encoding="utf-8") as f:
        for tr in soup.find_all('tr'):
            first_cell = tr.td
            # Skip rows without any <td> (e.g. header-only rows) and rows
            # whose first cell has no visible text.
            if first_cell is None or not any(first_cell.stripped_strings):
                continue
            f.write("|-\n")
            for cell in tr.find_all('td'):
                clean_html(cell)
                f.write("| " + "".join(str(x) for x in cell.children) + "\n")