# MySpace parser
# Tom Dyson, July 2006
# http://www.throwingbeans.org/

# Requires BeautifulSoup, tested on version 3:
# http://www.crummy.com/software/BeautifulSoup/

# TODO: distinguish band sites from personal sites

# usage:
# from MySpace import *
# n = MySpace('soundofnorthwood')
# total_friends = n.friend_count
# last_comment = n.comments[0]

__version__ = "0.2"
__license__ = "GPL 2"
__author__ = "Tom Dyson"
__contributors__ = "Tim Hatch (http://timhatch.com/)"
__all__ = ["MySpace"]

import BeautifulSoup
import re
import urllib

# Entity decoding from SBP's getlinks.py
import htmlentitydefs
htmlentitydefs.name2codepoint['apos'] = 0x27

r_entity = re.compile(r'&(#x[0-9A-Fa-f]+|#[0-9]+|[A-Za-z]+);')


def entity(m):
    """Return the replacement text for one entity match of ``r_entity``.

    Hexadecimal (``&#x41;``) and decimal (``&#65;``) character references
    and named entities known to ``htmlentitydefs`` are turned into the
    corresponding unicode character.  Anything that cannot be decoded
    (unknown name, code point rejected by ``unichr``) is returned
    unchanged, so one bad reference never aborts the whole substitution.
    """
    name = m.group(1)
    try:
        # int() copes with leading zeros by itself; stripping them first
        # used to turn '&#0;' / '&#x0;' into int(''), which raises.
        if name.startswith('#x'):
            return unichr(int(name[2:], 16))
        if name.startswith('#'):
            return unichr(int(name[1:]))
    except (ValueError, OverflowError):
        # Code point outside the range unichr() accepts: keep the text.
        return m.group(0)
    if name in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[name])
    return '&' + name + ';'


def decode_entities(s):
    """Return *s* with every HTML entity that ``entity`` can decode replaced."""
    return r_entity.sub(entity, s)


SPACE_STUB = 'http://www.myspace.com/'
FRIEND_STUB = 'http://profile.myspace.com/index.cfm?fuseaction=user.viewprofile&friendid='
FRIEND_HREF_ID = re.compile('ctl00_Main_ctl00_UserFriends1_F')
ADD_FRIEND_ID = 'ctl00_Main_ctl00_UserContactLinks1_AddFriendLink'
COMMENT_TD = {'align':'center', 'valign':'top', 'width':'150',
              'bgcolor':'FF9933', 'style':'word-wrap: break-word'}

# Extracts the numerical profile ID from a link.  Matched case-insensitively
# everywhere, as the comment-parsing code already did.
FRIEND_ID_PARAM = re.compile(r'friendID=([0-9]+)', re.IGNORECASE)


def _fetch(url):
    """Return the raw body of *url*, closing the connection afterwards."""
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


class MySpace(object):
    """A MySpace profile page, downloaded and parsed at construction time.

    Attributes set by ``__init__``:

    soup          the BeautifulSoup tree of the downloaded page
    name          contents of the ``span`` with class ``nametext``
    url           page title with ``www.myspace.com/`` removed, or None
                  when nothing is left
    id            numerical ID taken from the "add friend" link
    friend_count  integer read from the ``span`` with class ``redbtext``
    top_friends   list of ``(friend_name, friend_id)`` tuples
    influences    contents of the ``ProfileInfluences`` cell, or None
    comments      list of ``(commenter, commenter_id, comment_date,
                  comment)`` tuples, in page order
    """

    def __init__(self, myspace_name):
        """Fetch and parse the profile identified by *myspace_name*.

        *myspace_name* is either a MySpace URL name or, when it consists
        only of digits, a numerical friend ID.

        Raises whatever ``urllib`` raises on network failure, and
        ``AttributeError`` / ``TypeError`` when the page lacks the
        mandatory elements (name, title, add-friend link, friend count).
        """
        # get the HTML for MySpace URLs or numerical IDs
        if myspace_name.isdigit():
            page_url = FRIEND_STUB + myspace_name
        else:
            page_url = SPACE_STUB + myspace_name
        self.soup = BeautifulSoup.BeautifulSoup(_fetch(page_url))

        # find MySpace name
        self.name = self.soup.first('span', 'nametext').string

        # find MySpace URL
        title = self.soup.title.string.strip()
        title = title.replace('www.myspace.com/', '')
        self.url = title or None

        # find MySpace numerical ID
        add_friend_href = self.soup.first('a', id=ADD_FRIEND_ID)['href']
        self.id = FRIEND_ID_PARAM.search(add_friend_href).group(1)

        # find number of friends
        self.friend_count = int(self.soup.first('span', 'redbtext').string)

        # find Friends
        self.top_friends = self._parse_top_friends()

        # find influences (optional section: absent cell means None)
        influences_td = self.soup.first('td', id='ProfileInfluences')
        if influences_td is None:
            self.influences = None
        else:
            self.influences = influences_td.string

        # get comments
        self.comments = self._parse_comments()

    def _parse_top_friends(self):
        """Return ``(friend_name, friend_id)`` for each labelled friend link."""
        friends = []
        for link in self.soup.findAll('a', id=FRIEND_HREF_ID):
            label = link.string
            if not label:
                continue
            label = label.strip()
            if not label:
                # Links with no visible text are skipped.
                continue
            friend_id = link['href'].replace(FRIEND_STUB, '')
            friends.append((decode_entities(label), friend_id))
        return friends

    def _parse_comments(self):
        """Return one ``(commenter, id, date, text)`` tuple per comment cell."""
        comments = []
        for td in self.soup.findAll("td", attrs=COMMENT_TD):
            comment_link = td.first('a')
            commenter = decode_entities(comment_link.string.strip())
            commenter_id = FRIEND_ID_PARAM.search(comment_link['href']).group(1)

            # The comment body is two siblings after the commenter cell.
            comment_td = td.nextSibling.nextSibling
            comment_date = comment_td.first('span', 'blacktext10').string.strip()

            comment_lines = []
            for node in comment_td.contents:
                # len() rather than truthiness: a node's truth value is not
                # necessarily the same as "has content".
                if not len(node):
                    continue
                try:
                    comment_line = node.string.strip()
                except AttributeError:
                    # NOTE(review): presumably a tag whose .string is None
                    # (no single text child) - skip it, as before.
                    continue
                # Drop the fragment that merely repeats the date stamp.
                if comment_date not in comment_line:
                    comment_lines.append(comment_line)

            comment = ' '.join(comment_lines).strip()
            comments.append((commenter, commenter_id, comment_date, comment))
        return comments