author     Matěj Cepl <mcepl@cepl.eu>  2016-04-16 13:51:53 +0200
committer  Matěj Cepl <mcepl@cepl.eu>  2016-04-16 13:51:53 +0200
commit     00b48339eedd402eeb95e5e4f718529b6b0168a6 (patch)
tree       ff2efd66f78826697230a4af179893c0f059bed9 /gg_scraper.py
parent     5d876420a95f3cae5d38c7e7ffd82aa460194c3e (diff)
PEP8ization
Diffstat (limited to 'gg_scraper.py')
-rwxr-xr-x  gg_scraper.py  99
1 file changed, 53 insertions(+), 46 deletions(-)
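
For context on the URL handling this patch touches: the renamed helper unescape_google_bang_url (formerly unenscape_Google_bang_URL) rewrites Google Groups hash-bang URLs into the old AJAX-crawling "escaped fragment" form described in its docstring. A minimal standalone sketch of that rewrite follows; the function name is illustrative only and is not imported from the module.

    def escaped_fragment(url):
        """Rewrite a Google Groups '#!' URL into its crawlable form."""
        if '#!' in url:
            # https://groups.google.com/forum/#!forum/NAME
            # -> https://groups.google.com/forum/?_escaped_fragment_=forum/NAME
            return url.replace('#!', '?_escaped_fragment_=')
        if url.startswith('https://groups.google.com/d/topic/'):
            # https://groups.google.com/d/topic/NAME/ID
            # -> https://groups.google.com/forum/?_escaped_fragment_=topic/NAME/ID
            return url.replace(
                'https://groups.google.com/d/',
                'https://groups.google.com/forum/?_escaped_fragment_=')
        return url

    print(escaped_fragment('https://groups.google.com/d/topic/jbrout/dreCkob3KSs'))
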
diff --git a/gg_scraper.py b/gg_scraper.py
index 50c437a..2c24082 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -24,17 +24,19 @@ try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
-import operator
try:
from configparser import ConfigParser
except ImportError:
from ConfigParser import ConfigParser
+import logging
import mailbox
+import operator
import os.path
import re
import shutil
import subprocess
import sys
+
import yaml
try:
from urllib.error import HTTPError
@@ -43,13 +45,14 @@ try:
except ImportError:
from urllib2 import (HTTPError, HTTPHandler, HTTPRedirectHandler,
build_opener)
+
+from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
try:
from queue import Queue
except ImportError:
- from Queue import Queue
-from bs4 import BeautifulSoup
-import logging
+ from Queue import Queue # noqa
+
logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
level=logging.DEBUG)
@@ -65,6 +68,8 @@ pyver = sys.version_info
py26 = pyver[:2] < (2, 7)
py3k = pyver[0] == 3
+log = logging.getLogger('gg_scraper')
+
class BadURLError(ValueError):
pass
@@ -81,28 +86,28 @@ class Page(object):
pass
@staticmethod
- def unenscape_Google_bang_URL(old_URL):
+ def unescape_google_bang_url(old_url):
"""
See https://developers.google.com/webmasters\
/ajax-crawling/docs/getting-started for more information
"""
- if old_URL.find('#!') != -1:
- return old_URL.replace('#!', '?_escaped_fragment_=')
- elif old_URL.startswith('https://groups.google.com/d/topic/'):
+ if old_url.find('#!') != -1:
+ return old_url.replace('#!', '?_escaped_fragment_=')
+ elif old_url.startswith('https://groups.google.com/d/topic/'):
# DEBUG:get_one_topic:URL collected =
# https://groups.google.com/d/topic/jbrout/dreCkob3KSs
# DEBUG:__init__:root_URL =
# https://groups.google.com/forum/\
# ?_escaped_fragment_=topic/jbrout/dreCkob3KSs
- return old_URL.replace(
+ return old_url.replace(
'https://groups.google.com/d/',
'https://groups.google.com/forum/?_escaped_fragment_='
)
else:
- return old_URL
+ return old_url
- def _get_page_BS(self, URL):
- res = self.opener.open(self.unenscape_Google_bang_URL(URL))
+ def _get_page_bs(self, url):
+ res = self.opener.open(self.unescape_google_bang_url(url))
in_str = res.read()
bs = BeautifulSoup(in_str)
res.close()
@@ -110,13 +115,13 @@ class Page(object):
class Article(Page):
- def __init__(self, URL):
+ def __init__(self, url):
super(Article, self).__init__()
- self.root = URL.replace('d/msg/', 'forum/message/raw?msg=')
+ self.root = url.replace('d/msg/', 'forum/message/raw?msg=')
self.raw_message = ''
def collect_message(self):
- logging.debug('self.root = {0}'.format(self.root))
+ log.debug('self.root = {0}'.format(self.root))
result = None
try:
res = self.opener.open(self.root)
@@ -127,7 +132,6 @@ class Article(Page):
proc = subprocess.Popen(['/usr/bin/formail'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
- #universal_newlines=True)
if not(py3k and isinstance(raw_msg, bytes)):
raw_msg = raw_msg.encode('utf8')
result = proc.communicate(raw_msg)[0]
@@ -142,11 +146,11 @@ class Article(Page):
class Topic(Page):
- def __init__(self, URL, name):
+ def __init__(self, url, name):
super(Topic, self).__init__()
self.name = name
- root_URL = self.unenscape_Google_bang_URL(URL)
- self.root = root_URL
+ root_url = self.unescape_google_bang_url(url)
+ self.root = root_url
self.articles = []
def __unicode__(self):
@@ -160,8 +164,8 @@ class Topic(Page):
'''Get total number of articles from the number on the page
itself.
'''
- BS = self._get_page_BS(self.root)
- i_elem = BS.find_all('i')
+ bs = self._get_page_bs(self.root)
+ i_elem = bs.find_all('i')
if len(i_elem) <= 0:
raise ValueError('Cannot find count of topics!')
@@ -170,7 +174,7 @@ class Topic(Page):
def get_articles(self):
out = []
- page = self._get_page_BS(self.root)
+ page = self._get_page_bs(self.root)
for a_elem in page.find_all('a'):
if 'href' in a_elem.attrs:
a_href = a_elem['href']
@@ -182,28 +186,30 @@ class Topic(Page):
class Group(Page):
- GOOD_URL_RE = re.compile(r'https://groups.google.com/forum/#!forum/(.+)')
+ good_url_re = re.compile(r'https://groups.google.com/forum/#!forum/(.+)')
- def __init__(self, URL):
+ def __init__(self, url):
super(Group, self).__init__()
- self.group_URL = URL
+ self.group_URL = url
self.topics = []
- match = self.GOOD_URL_RE.match(URL)
- logging.debug('match = %s', match)
+ match = self.good_url_re.match(url)
+ log.debug('match = %s', match)
if match is None:
- raise BadURLError("Required URL in form 'https://groups.google.com/forum/#!forum/GROUPNAME'")
+ raise BadURLError(
+ "Required url in form " +
+ "'https://groups.google.com/forum/#!forum/GROUPNAME'")
self.name = match.group(1)
@staticmethod
- def get_count_topics(BS):
+ def get_count_topics(bs):
'''Get total number of topics from the number on the page
itself.
Which would be awesome for control, except it is wrong on all
pages in various and different ways. :(
'''
- i_elem = BS.find_all('i')
+ i_elem = bs.find_all('i')
if len(i_elem) <= 0:
raise ValueError('Cannot find count of topics!')
@@ -214,12 +220,12 @@ class Group(Page):
def get_one_topic(elem):
sys.stdout.write('. ')
sys.stdout.flush()
- #logging.debug('URL collected = {0}'.format(elem['href']))
+ # log.debug('URL collected = {0}'.format(elem['href']))
if 'title' in elem.attrs:
# filter out all-non-topic <a>s
return True, Topic(elem['href'], elem['title'])
else:
- logging.debug('other = %s', elem)
+ log.debug('other = %s', elem)
return False, elem
def get_topics(self):
@@ -232,8 +238,8 @@ class Group(Page):
while target_stack:
other = []
- BS = self._get_page_BS(target_stack.pop(0))
- for a_elem in BS.find_all('a'):
+ bs = self._get_page_bs(target_stack.pop(0))
+ for a_elem in bs.find_all('a'):
is_topic, res = self.get_one_topic(a_elem)
# Ignore link in welcome message, e.g. django-oscar group
is_welcomemsg = a_elem.get('target') == 'welcomeMsg'
@@ -258,9 +264,10 @@ class Group(Page):
jobs = []
with ThreadPoolExecutor(MAX_THREADS) as executor:
for top in self.topics:
- #print('[%d/%d] downloading "%s"' % (self.topics.index(top),
+ # print('[%d/%d] downloading "%s"' % (self.topics.index(top),
# len_topics, top.name))
- print('[%d/%d] downloading' % (self.topics.index(top), len_topics))
+ print('[%d/%d] downloading' %
+ (self.topics.index(top), len_topics))
job = executor.submit(top.get_articles)
jobs.append(job)
@@ -340,15 +347,15 @@ class MBOX(mailbox.mbox):
self.close()
-def main(group_URL):
+def main(group_url):
# Collect all messages to the internal variables
if os.path.exists('group.yaml'):
with open('group.yaml') as yf:
- logging.debug('Loading state from group.yaml')
+ log.debug('Loading state from group.yaml')
grp = yaml.load(yf)
- logging.debug('Done')
+ log.debug('Done')
else:
- grp = Group(group_URL)
+ grp = Group(group_url)
grp.collect_group()
# dump the state for debugging
@@ -366,7 +373,7 @@ def main(group_URL):
def demangle(correct_list, orig_mbx, out_mbx):
cnf_p = ConfigParser(dict_type=OrderedDict)
cnf_p.read(correct_list)
- #pairs = dict(cnf_p.items(ADDR_SEC_LABEL))
+ # pairs = dict(cnf_p.items(ADDR_SEC_LABEL))
pairs = dict((k, {'repl': v, 'RE': re.compile(r'\b%s\b' % k,
re.IGNORECASE)})
for (k, v) in cnf_p.items(ADDR_SEC_LABEL)
@@ -386,9 +393,9 @@ def demangle(correct_list, orig_mbx, out_mbx):
if matches is not None:
u_from = msg.get_from()
for orig, fixed in pairs.items():
- #if (orig is None) or (fixed is None):
+ # if (orig is None) or (fixed is None):
# continue
- #msg_str = msg_str.replace(orig, fixed)
+ # msg_str = msg_str.replace(orig, fixed)
msg_str = fixed['RE'].sub(fixed['repl'], msg_str)
counter += 1 # This is wrong
out_msg = mailbox.mboxMessage(msg_str)
@@ -403,8 +410,8 @@ def demangle(correct_list, orig_mbx, out_mbx):
if __name__ == '__main__':
- parser = argparse.ArgumentParser(description=
- 'Scrape a Google Groups group.')
+ parser = argparse.ArgumentParser(
+ description='Scrape a Google Groups group.')
parser.add_argument('group', metavar='URL', nargs='?',
help='URL of the group')
parser.add_argument('-d', '--demangle', metavar='DEMANGLE_FILE', nargs=3,
@@ -413,7 +420,7 @@ if __name__ == '__main__':
'file.')
args = parser.parse_args()
- logging.debug('args = {0}'.format(args))
+ log.debug('args = {0}'.format(args))
if args.demangle is not None:
demangle(args.demangle[0], args.demangle[1], args.demangle[2])