-rwxr-xr-x | gg_scraper.py     | 99
-rw-r--r-- | test/test_unit.py |  2
2 files changed, 54 insertions, 47 deletions
diff --git a/gg_scraper.py b/gg_scraper.py
index 50c437a..2c24082 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -24,17 +24,19 @@ try:
     from collections import OrderedDict
 except ImportError:
     from ordereddict import OrderedDict
-import operator
 try:
     from configparser import ConfigParser
 except ImportError:
     from ConfigParser import ConfigParser
+import logging
 import mailbox
+import operator
 import os.path
 import re
 import shutil
 import subprocess
 import sys
+
 import yaml
 try:
     from urllib.error import HTTPError
@@ -43,13 +45,14 @@ try:
 except ImportError:
     from urllib2 import (HTTPError, HTTPHandler, HTTPRedirectHandler,
                          build_opener)
+
+from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor, as_completed
 try:
     from queue import Queue
 except ImportError:
-    from Queue import Queue
-from bs4 import BeautifulSoup
-import logging
+    from Queue import Queue  # noqa
+
 logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
                     level=logging.DEBUG)
 
@@ -65,6 +68,8 @@ pyver = sys.version_info
 py26 = pyver[:2] < (2, 7)
 py3k = pyver[0] == 3
 
+log = logging.getLogger('gg_scraper')
+
 
 class BadURLError(ValueError):
     pass
@@ -81,28 +86,28 @@ class Page(object):
             pass
 
     @staticmethod
-    def unenscape_Google_bang_URL(old_URL):
+    def unescape_google_bang_url(old_url):
         """
        See https://developers.google.com/webmasters\
                /ajax-crawling/docs/getting-started for more information
        """
-        if old_URL.find('#!') != -1:
-            return old_URL.replace('#!', '?_escaped_fragment_=')
-        elif old_URL.startswith('https://groups.google.com/d/topic/'):
+        if old_url.find('#!') != -1:
+            return old_url.replace('#!', '?_escaped_fragment_=')
+        elif old_url.startswith('https://groups.google.com/d/topic/'):
             # DEBUG:get_one_topic:URL collected =
             # https://groups.google.com/d/topic/jbrout/dreCkob3KSs
             # DEBUG:__init__:root_URL =
             # https://groups.google.com/forum/\
             #     ?_escaped_fragment_=topic/jbrout/dreCkob3KSs
-            return old_URL.replace(
+            return old_url.replace(
                 'https://groups.google.com/d/',
                 'https://groups.google.com/forum/?_escaped_fragment_='
             )
         else:
-            return old_URL
+            return old_url
 
-    def _get_page_BS(self, URL):
-        res = self.opener.open(self.unenscape_Google_bang_URL(URL))
+    def _get_page_bs(self, url):
+        res = self.opener.open(self.unescape_google_bang_url(url))
         in_str = res.read()
         bs = BeautifulSoup(in_str)
         res.close()
@@ -110,13 +115,13 @@ class Page(object):
 
 
 class Article(Page):
-    def __init__(self, URL):
+    def __init__(self, url):
         super(Article, self).__init__()
-        self.root = URL.replace('d/msg/', 'forum/message/raw?msg=')
+        self.root = url.replace('d/msg/', 'forum/message/raw?msg=')
         self.raw_message = ''
 
     def collect_message(self):
-        logging.debug('self.root = {0}'.format(self.root))
+        log.debug('self.root = {0}'.format(self.root))
         result = None
         try:
             res = self.opener.open(self.root)
@@ -127,7 +132,6 @@ class Article(Page):
             proc = subprocess.Popen(['/usr/bin/formail'],
                                     stdin=subprocess.PIPE,
                                     stdout=subprocess.PIPE)
-                                    #universal_newlines=True)
             if not(py3k and isinstance(raw_msg, bytes)):
                 raw_msg = raw_msg.encode('utf8')
             result = proc.communicate(raw_msg)[0]
@@ -142,11 +146,11 @@ class Article(Page):
 
 
 class Topic(Page):
-    def __init__(self, URL, name):
+    def __init__(self, url, name):
         super(Topic, self).__init__()
         self.name = name
-        root_URL = self.unenscape_Google_bang_URL(URL)
-        self.root = root_URL
+        root_url = self.unescape_google_bang_url(url)
+        self.root = root_url
         self.articles = []
 
     def __unicode__(self):
@@ -160,8 +164,8 @@ class Topic(Page):
         '''Get total number of articles from the number on the page
        itself.
        '''
-        BS = self._get_page_BS(self.root)
-        i_elem = BS.find_all('i')
+        bs = self._get_page_bs(self.root)
+        i_elem = bs.find_all('i')
         if len(i_elem) <= 0:
             raise ValueError('Cannot find count of topics!')
 
@@ -170,7 +174,7 @@ class Topic(Page):
 
     def get_articles(self):
         out = []
-        page = self._get_page_BS(self.root)
+        page = self._get_page_bs(self.root)
         for a_elem in page.find_all('a'):
             if 'href' in a_elem.attrs:
                 a_href = a_elem['href']
@@ -182,28 +186,30 @@ class Topic(Page):
 
 
 class Group(Page):
-    GOOD_URL_RE = re.compile(r'https://groups.google.com/forum/#!forum/(.+)')
+    good_url_re = re.compile(r'https://groups.google.com/forum/#!forum/(.+)')
 
-    def __init__(self, URL):
+    def __init__(self, url):
         super(Group, self).__init__()
-        self.group_URL = URL
+        self.group_URL = url
         self.topics = []
-        match = self.GOOD_URL_RE.match(URL)
-        logging.debug('match = %s', match)
+        match = self.good_url_re.match(url)
+        log.debug('match = %s', match)
         if match is None:
-            raise BadURLError("Required URL in form 'https://groups.google.com/forum/#!forum/GROUPNAME'")
+            raise BadURLError(
+                "Required url in form " +
+                "'https://groups.google.com/forum/#!forum/GROUPNAME'")
         self.name = match.group(1)
 
     @staticmethod
-    def get_count_topics(BS):
+    def get_count_topics(bs):
         '''Get total number of topics from the number on the page
        itself.

        Which would be awesome for control, except it is wrong on all
        pages in various and different ways. :(

        '''
-        i_elem = BS.find_all('i')
+        i_elem = bs.find_all('i')
         if len(i_elem) <= 0:
             raise ValueError('Cannot find count of topics!')
 
@@ -214,12 +220,12 @@ class Group(Page):
     def get_one_topic(elem):
         sys.stdout.write('. ')
         sys.stdout.flush()
-        #logging.debug('URL collected = {0}'.format(elem['href']))
+        # log.debug('URL collected = {0}'.format(elem['href']))
         if 'title' in elem.attrs:
             # filter out all-non-topic <a>s
             return True, Topic(elem['href'], elem['title'])
         else:
-            logging.debug('other = %s', elem)
+            log.debug('other = %s', elem)
             return False, elem
 
     def get_topics(self):
@@ -232,8 +238,8 @@ class Group(Page):
 
         while target_stack:
             other = []
-            BS = self._get_page_BS(target_stack.pop(0))
-            for a_elem in BS.find_all('a'):
+            bs = self._get_page_bs(target_stack.pop(0))
+            for a_elem in bs.find_all('a'):
                 is_topic, res = self.get_one_topic(a_elem)
                 # Ignore link in welcome message, e.g. django-oscar group
                 is_welcomemsg = a_elem.get('target') == 'welcomeMsg'
@@ -258,9 +264,10 @@ class Group(Page):
         jobs = []
         with ThreadPoolExecutor(MAX_THREADS) as executor:
             for top in self.topics:
-                #print('[%d/%d] downloading "%s"' % (self.topics.index(top),
+                # print('[%d/%d] downloading "%s"' % (self.topics.index(top),
                 #       len_topics, top.name))
-                print('[%d/%d] downloading' % (self.topics.index(top), len_topics))
+                print('[%d/%d] downloading' %
+                      (self.topics.index(top), len_topics))
                 job = executor.submit(top.get_articles)
                 jobs.append(job)
 
@@ -340,15 +347,15 @@ class MBOX(mailbox.mbox):
         self.close()
 
 
-def main(group_URL):
+def main(group_url):
     # Collect all messages to the internal variables
     if os.path.exists('group.yaml'):
         with open('group.yaml') as yf:
-            logging.debug('Loading state from group.yaml')
+            log.debug('Loading state from group.yaml')
             grp = yaml.load(yf)
-            logging.debug('Done')
+            log.debug('Done')
     else:
-        grp = Group(group_URL)
+        grp = Group(group_url)
         grp.collect_group()
 
     # dump the state for debugging
@@ -366,7 +373,7 @@ def main(group_URL):
 def demangle(correct_list, orig_mbx, out_mbx):
     cnf_p = ConfigParser(dict_type=OrderedDict)
     cnf_p.read(correct_list)
-    #pairs = dict(cnf_p.items(ADDR_SEC_LABEL))
+    # pairs = dict(cnf_p.items(ADDR_SEC_LABEL))
     pairs = dict((k, {'repl': v,
                       'RE': re.compile(r'\b%s\b' % k, re.IGNORECASE)})
                  for (k, v) in cnf_p.items(ADDR_SEC_LABEL)
@@ -386,9 +393,9 @@ def demangle(correct_list, orig_mbx, out_mbx):
         if matches is not None:
             u_from = msg.get_from()
             for orig, fixed in pairs.items():
-                #if (orig is None) or (fixed is None):
+                # if (orig is None) or (fixed is None):
                 #    continue
-                #msg_str = msg_str.replace(orig, fixed)
+                # msg_str = msg_str.replace(orig, fixed)
                 msg_str = fixed['RE'].sub(fixed['repl'], msg_str)
                 counter += 1  # This is wrong
             out_msg = mailbox.mboxMessage(msg_str)
@@ -403,8 +410,8 @@ def demangle(correct_list, orig_mbx, out_mbx):
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description=
-                                     'Scrape a Google Groups group.')
+    parser = argparse.ArgumentParser(
+        description='Scrape a Google Groups group.')
     parser.add_argument('group', metavar='URL', nargs='?',
                         help='URL of the group')
     parser.add_argument('-d', '--demangle', metavar='DEMANGLE_FILE', nargs=3,
@@ -413,7 +420,7 @@ if __name__ == '__main__':
                              'file.')
     args = parser.parse_args()
 
-    logging.debug('args = {0}'.format(args))
+    log.debug('args = {0}'.format(args))
     if args.demangle is not None:
         demangle(args.demangle[0], args.demangle[1], args.demangle[2])
 
diff --git a/test/test_unit.py b/test/test_unit.py
index 25de312..eed09cc 100644
--- a/test/test_unit.py
+++ b/test/test_unit.py
@@ -17,7 +17,7 @@ EXP_URL = 'https://groups.google.com/forum/' + \
 
 class TestGGScrapper(unittest.TestCase):
     def test_URL_conversion(self):
-        obs_URL = gg_scraper.Group.unenscape_Google_bang_URL(IN_URL)
+        obs_URL = gg_scraper.Group.unescape_google_bang_url(IN_URL)
         self.assertEqual(obs_URL, EXP_URL)
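For context, a minimal standalone sketch of the URL rewriting done by the renamed unescape_google_bang_url() helper, reconstructed from the hunk above; the example URL is the one quoted in the diff's own comments, and the simplified '#!' check is an illustration rather than part of the change:

def unescape_google_bang_url(old_url):
    # Convert a Google Groups '#!' URL into its '_escaped_fragment_' form,
    # mirroring the logic shown in the diff above.
    if '#!' in old_url:
        return old_url.replace('#!', '?_escaped_fragment_=')
    elif old_url.startswith('https://groups.google.com/d/topic/'):
        return old_url.replace(
            'https://groups.google.com/d/',
            'https://groups.google.com/forum/?_escaped_fragment_=')
    return old_url


print(unescape_google_bang_url(
    'https://groups.google.com/d/topic/jbrout/dreCkob3KSs'))
# -> https://groups.google.com/forum/?_escaped_fragment_=topic/jbrout/dreCkob3KSs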