Protoball:Mlb chron to import csv.py: Difference between revisions
Jump to navigation
Jump to search
(Created page with "Category:Python Scripts <pre> def mlb_chron_to_cvs(): class Entry(): def __init__(self): self.headline = "" self.year = 0 self.ID = "" self.description = ""...") |
m (Dave moved page Mlb chron to import csv.py to Protoball:Mlb chron to import csv.py without leaving a redirect) |
(No difference)
|
Latest revision as of 07:36, 24 July 2012
def mlb_chron_to_cvs(output_path="/home/dave/Desktop/chron_import.csv"):
    """Scrape the MLB historical-chronology XML feeds and write a CSV.

    Downloads five date-range XML files from mlb.mlb.com, parses each
    <item> element into an Entry, normalizes its ID/description/tags,
    and writes one row per entry in the Protoball import format.

    Parameters:
        output_path: destination CSV file.  Defaults to the original
            hard-coded desktop path for backward compatibility.
    """
    import csv
    from urllib import request

    from bs4 import BeautifulSoup

    class Entry():
        """One chronology item pulled from the MLB XML feed."""

        def __init__(self):
            self.headline = ""
            self.year = 0
            self.ID = ""           # display year, e.g. "c.1820" or "1845s."
            self.description = ""  # HTML fragment from the feed
            self.tags = []
            self.location = ""     # filled by distribute_tags()
            self.game = ""         # filled by distribute_tags()
            self.suffix = ""       # 'c' (circa) or 's' (decade), from ID

        def prep(self):
            """Run all normalization passes on this entry, in order."""
            self.clean_ID_and_assign_suffix()
            self.clean_description()
            self.distribute_tags()

        def clean_ID_and_assign_suffix(self):
            # Normalize "C."/"S." to lowercase, then record which marker
            # the display year carries (circa vs. decade).
            self.ID = self.ID.replace('C.', 'c.').replace('S.', 's.')
            if 'c.' in self.ID:
                self.suffix = 'c'
            elif 's.' in self.ID:
                self.suffix = 's'

        def clean_description(self):
            """Drop class attributes from <p> tags and unwrap any
            <html>/<body> wrapper a lenient parser may have added."""
            # Explicit parser: the original relied on bs4's auto-pick,
            # which varies with installed libraries (and warns).
            soup = BeautifulSoup(self.description, "html.parser")
            for p in soup.find_all("p"):
                # Original used hasattr(p, "class"), which is always
                # truthy for bs4 Tags; has_attr() is the correct check.
                if p.has_attr("class"):
                    del p["class"]
            if soup.html:
                soup.html.unwrap()
                soup.body.unwrap()
            self.description = str(soup)

        def distribute_tags(self):
            """Split raw tags into a location, a game, and leftover tags."""
            locations = ['California', 'Canada', 'Harvard', 'Illinois',
                         'New England', 'New Jersey', 'Philadelphia',
                         'South', 'Wisconsin', 'Western New York', 'Texas']
            games = ['Bat-Ball', 'Cricket', 'Oddball', 'Stoolball',
                     'Town Ball', 'Wicket', 'Xenoball', 'Rounders',
                     'Base Ball']
            # Map feed-side CamelCase tag names to their display forms.
            sub = {
                'WNY': 'Western New York',
                'PreKnicks': 'Pre-Knicks',
                'NewEngland': 'New England',
                'AfricanAmericans': 'African Americans',
                'NewJersey': 'New Jersey',
                'TownBall': 'Town Ball',
                'BaseBall': 'Base Ball',
                'BatBall': 'Bat-Ball',
                'IllinoisMissouri': 'Illinois',
            }
            raw_tags = list(self.tags)
            self.tags = []
            for tag in raw_tags:
                tag = sub.get(tag, tag)
                if tag in locations:
                    self.location = tag
                elif tag in games:
                    self.game = tag
                else:
                    self.tags.append(tag)

    date_ranges = ["1000-1500", "1501-1700", "1701-1800",
                   "1801-1825", "1826-1870"]
    entries = []
    for date_range in date_ranges:
        url = "http://mlb.mlb.com/gen/hb/chron/{0}.xml".format(date_range)
        # 'with' closes the HTTP response (original leaked it).
        with request.urlopen(url) as r:
            t = r.read()
        soup = BeautifulSoup(t, "xml")
        for item in soup.find_all("item"):
            entry = Entry()
            entry.headline = item.select("field[key=title]")[0].string
            entry.ID = item.select("field[key=displayYear]")[0].string
            entry.description = item.select("field[key=description]")[0].string
            if not entry.description:
                continue  # skip items with empty bodies
            entry.year = int(item.select("field[key=year]")[0].string)
            entry.tags = [itemTag['displayName']
                          for itemTag in item.find_all("itemTag")]
            entry.prep()
            entries.append(entry)

    header = ['Title', 'Chronology Entry[Headline]',
              'Chronology Entry[Year]', 'Chronology Entry[Year Suffix]',
              'Chronology Entry[Is in main chronology]',
              'Chronology Entry[Location]', 'Chronology Entry[Game]',
              'Chronology Entry[Tags]', 'Chronology Entry[Text]']
    rows = [header]
    for entry in entries:
        rows.append([
            entry.ID,              # Title column holds the display year
            entry.headline,
            entry.year,
            entry.suffix,
            "yes",                 # every scraped entry joins the main chronology
            entry.location,
            entry.game,
            ",".join(entry.tags),
            entry.description,
        ])

    # newline='' is required by csv.writer to avoid doubled line endings;
    # 'with' guarantees the file is closed even if a write fails
    # (the original leaked the handle on error).
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(rows)