-rwxr-xr-x  gg_scraper.py     | 99
-rw-r--r--  test/test_unit.py |  2
2 files changed, 54 insertions(+), 47 deletions(-)
diff --git a/gg_scraper.py b/gg_scraper.py
index 50c437a..2c24082 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -24,17 +24,19 @@ try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
-import operator
try:
from configparser import ConfigParser
except ImportError:
from ConfigParser import ConfigParser
+import logging
import mailbox
+import operator
import os.path
import re
import shutil
import subprocess
import sys
+
import yaml
try:
from urllib.error import HTTPError
@@ -43,13 +45,14 @@ try:
except ImportError:
from urllib2 import (HTTPError, HTTPHandler, HTTPRedirectHandler,
build_opener)
+
+from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
try:
from queue import Queue
except ImportError:
- from Queue import Queue
-from bs4 import BeautifulSoup
-import logging
+ from Queue import Queue # noqa
+
logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
level=logging.DEBUG)
@@ -65,6 +68,8 @@ pyver = sys.version_info
py26 = pyver[:2] < (2, 7)
py3k = pyver[0] == 3
+log = logging.getLogger('gg_scraper')
+
class BadURLError(ValueError):
pass
@@ -81,28 +86,28 @@ class Page(object):
pass
@staticmethod
- def unenscape_Google_bang_URL(old_URL):
+ def unescape_google_bang_url(old_url):
"""
See https://developers.google.com/webmasters\
/ajax-crawling/docs/getting-started for more information
"""
- if old_URL.find('#!') != -1:
- return old_URL.replace('#!', '?_escaped_fragment_=')
- elif old_URL.startswith('https://groups.google.com/d/topic/'):
+ if old_url.find('#!') != -1:
+ return old_url.replace('#!', '?_escaped_fragment_=')
+ elif old_url.startswith('https://groups.google.com/d/topic/'):
# DEBUG:get_one_topic:URL collected =
# https://groups.google.com/d/topic/jbrout/dreCkob3KSs
# DEBUG:__init__:root_URL =
# https://groups.google.com/forum/\
# ?_escaped_fragment_=topic/jbrout/dreCkob3KSs
- return old_URL.replace(
+ return old_url.replace(
'https://groups.google.com/d/',
'https://groups.google.com/forum/?_escaped_fragment_='
)
else:
- return old_URL
+ return old_url
- def _get_page_BS(self, URL):
- res = self.opener.open(self.unenscape_Google_bang_URL(URL))
+ def _get_page_bs(self, url):
+ res = self.opener.open(self.unescape_google_bang_url(url))
in_str = res.read()
bs = BeautifulSoup(in_str)
res.close()
@@ -110,13 +115,13 @@ class Page(object):
class Article(Page):
- def __init__(self, URL):
+ def __init__(self, url):
super(Article, self).__init__()
- self.root = URL.replace('d/msg/', 'forum/message/raw?msg=')
+ self.root = url.replace('d/msg/', 'forum/message/raw?msg=')
self.raw_message = ''
def collect_message(self):
- logging.debug('self.root = {0}'.format(self.root))
+ log.debug('self.root = {0}'.format(self.root))
result = None
try:
res = self.opener.open(self.root)
@@ -127,7 +132,6 @@ class Article(Page):
proc = subprocess.Popen(['/usr/bin/formail'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
- #universal_newlines=True)
if not(py3k and isinstance(raw_msg, bytes)):
raw_msg = raw_msg.encode('utf8')
result = proc.communicate(raw_msg)[0]
@@ -142,11 +146,11 @@ class Article(Page):
class Topic(Page):
- def __init__(self, URL, name):
+ def __init__(self, url, name):
super(Topic, self).__init__()
self.name = name
- root_URL = self.unenscape_Google_bang_URL(URL)
- self.root = root_URL
+ root_url = self.unescape_google_bang_url(url)
+ self.root = root_url
self.articles = []
def __unicode__(self):
@@ -160,8 +164,8 @@ class Topic(Page):
'''Get total number of articles from the number on the page
itself.
'''
- BS = self._get_page_BS(self.root)
- i_elem = BS.find_all('i')
+ bs = self._get_page_bs(self.root)
+ i_elem = bs.find_all('i')
if len(i_elem) <= 0:
raise ValueError('Cannot find count of topics!')
@@ -170,7 +174,7 @@ class Topic(Page):
def get_articles(self):
out = []
- page = self._get_page_BS(self.root)
+ page = self._get_page_bs(self.root)
for a_elem in page.find_all('a'):
if 'href' in a_elem.attrs:
a_href = a_elem['href']
@@ -182,28 +186,30 @@ class Topic(Page):
class Group(Page):
- GOOD_URL_RE = re.compile(r'https://groups.google.com/forum/#!forum/(.+)')
+ good_url_re = re.compile(r'https://groups.google.com/forum/#!forum/(.+)')
- def __init__(self, URL):
+ def __init__(self, url):
super(Group, self).__init__()
- self.group_URL = URL
+ self.group_URL = url
self.topics = []
- match = self.GOOD_URL_RE.match(URL)
- logging.debug('match = %s', match)
+ match = self.good_url_re.match(url)
+ log.debug('match = %s', match)
if match is None:
- raise BadURLError("Required URL in form 'https://groups.google.com/forum/#!forum/GROUPNAME'")
+ raise BadURLError(
+ "Required url in form " +
+ "'https://groups.google.com/forum/#!forum/GROUPNAME'")
self.name = match.group(1)
@staticmethod
- def get_count_topics(BS):
+ def get_count_topics(bs):
'''Get total number of topics from the number on the page
itself.
Which would be awesome for control, except it is wrong on all
pages in various and different ways. :(
'''
- i_elem = BS.find_all('i')
+ i_elem = bs.find_all('i')
if len(i_elem) <= 0:
raise ValueError('Cannot find count of topics!')
@@ -214,12 +220,12 @@ class Group(Page):
def get_one_topic(elem):
sys.stdout.write('. ')
sys.stdout.flush()
- #logging.debug('URL collected = {0}'.format(elem['href']))
+ # log.debug('URL collected = {0}'.format(elem['href']))
if 'title' in elem.attrs:
# filter out all-non-topic <a>s
return True, Topic(elem['href'], elem['title'])
else:
- logging.debug('other = %s', elem)
+ log.debug('other = %s', elem)
return False, elem
def get_topics(self):
@@ -232,8 +238,8 @@ class Group(Page):
while target_stack:
other = []
- BS = self._get_page_BS(target_stack.pop(0))
- for a_elem in BS.find_all('a'):
+ bs = self._get_page_bs(target_stack.pop(0))
+ for a_elem in bs.find_all('a'):
is_topic, res = self.get_one_topic(a_elem)
# Ignore link in welcome message, e.g. django-oscar group
is_welcomemsg = a_elem.get('target') == 'welcomeMsg'
@@ -258,9 +264,10 @@ class Group(Page):
jobs = []
with ThreadPoolExecutor(MAX_THREADS) as executor:
for top in self.topics:
- #print('[%d/%d] downloading "%s"' % (self.topics.index(top),
+ # print('[%d/%d] downloading "%s"' % (self.topics.index(top),
# len_topics, top.name))
- print('[%d/%d] downloading' % (self.topics.index(top), len_topics))
+ print('[%d/%d] downloading' %
+ (self.topics.index(top), len_topics))
job = executor.submit(top.get_articles)
jobs.append(job)
@@ -340,15 +347,15 @@ class MBOX(mailbox.mbox):
self.close()
-def main(group_URL):
+def main(group_url):
# Collect all messages to the internal variables
if os.path.exists('group.yaml'):
with open('group.yaml') as yf:
- logging.debug('Loading state from group.yaml')
+ log.debug('Loading state from group.yaml')
grp = yaml.load(yf)
- logging.debug('Done')
+ log.debug('Done')
else:
- grp = Group(group_URL)
+ grp = Group(group_url)
grp.collect_group()
# dump the state for debugging
@@ -366,7 +373,7 @@ def main(group_URL):
def demangle(correct_list, orig_mbx, out_mbx):
cnf_p = ConfigParser(dict_type=OrderedDict)
cnf_p.read(correct_list)
- #pairs = dict(cnf_p.items(ADDR_SEC_LABEL))
+ # pairs = dict(cnf_p.items(ADDR_SEC_LABEL))
pairs = dict((k, {'repl': v, 'RE': re.compile(r'\b%s\b' % k,
re.IGNORECASE)})
for (k, v) in cnf_p.items(ADDR_SEC_LABEL)
@@ -386,9 +393,9 @@ def demangle(correct_list, orig_mbx, out_mbx):
if matches is not None:
u_from = msg.get_from()
for orig, fixed in pairs.items():
- #if (orig is None) or (fixed is None):
+ # if (orig is None) or (fixed is None):
# continue
- #msg_str = msg_str.replace(orig, fixed)
+ # msg_str = msg_str.replace(orig, fixed)
msg_str = fixed['RE'].sub(fixed['repl'], msg_str)
counter += 1 # This is wrong
out_msg = mailbox.mboxMessage(msg_str)
@@ -403,8 +410,8 @@ def demangle(correct_list, orig_mbx, out_mbx):
if __name__ == '__main__':
- parser = argparse.ArgumentParser(description=
- 'Scrape a Google Groups group.')
+ parser = argparse.ArgumentParser(
+ description='Scrape a Google Groups group.')
parser.add_argument('group', metavar='URL', nargs='?',
help='URL of the group')
parser.add_argument('-d', '--demangle', metavar='DEMANGLE_FILE', nargs=3,
@@ -413,7 +420,7 @@ if __name__ == '__main__':
'file.')
args = parser.parse_args()
- logging.debug('args = {0}'.format(args))
+ log.debug('args = {0}'.format(args))
if args.demangle is not None:
demangle(args.demangle[0], args.demangle[1], args.demangle[2])
diff --git a/test/test_unit.py b/test/test_unit.py
index 25de312..eed09cc 100644
--- a/test/test_unit.py
+++ b/test/test_unit.py
@@ -17,7 +17,7 @@ EXP_URL = 'https://groups.google.com/forum/' + \
class TestGGScrapper(unittest.TestCase):
def test_URL_conversion(self):
- obs_URL = gg_scraper.Group.unenscape_Google_bang_URL(IN_URL)
+ obs_URL = gg_scraper.Group.unescape_google_bang_url(IN_URL)
self.assertEqual(obs_URL, EXP_URL)