path: root/gg_scraper.py
author    Matěj Cepl <mcepl@redhat.com>  2014-01-05 09:25:14 +0100
committer Matěj Cepl <mcepl@redhat.com>  2014-01-11 10:50:25 +0100
commit    c5c5b68dd2ed591e0ad411bf3dde4611cb39f18c (patch)
tree      533014d5bc07f3a5432800d059dd8c33fe446b2e /gg_scraper.py
parent    2b7981e4f8ad425d73936751789c7609f5541a1b (diff)
download  gg_scraper-c5c5b68dd2ed591e0ad411bf3dde4611cb39f18c.tar.gz
scrapper -> scraper (tag: 0.5)
Woops! scrapper: a fighter or aggressive competitor, especially one always ready or eager for a fight, argument, or contest: the best lightweight scrapper in boxing; a rugged political scrapper. That's not what I meant.
Diffstat (limited to 'gg_scraper.py')
-rwxr-xr-x  gg_scraper.py  340
1 files changed, 340 insertions, 0 deletions
diff --git a/gg_scraper.py b/gg_scraper.py
new file mode 100755
index 0000000..209b92c
--- /dev/null
+++ b/gg_scraper.py
@@ -0,0 +1,340 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+Download a Google Group to MBOX
+Copyright (C) 2014 Matěj Cepl
+
+This program is free software: you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation, either version 3 of the License, or (at your
+option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+from __future__ import absolute_import, print_function, unicode_literals
+
+import argparse
+try:
+    from configparser import ConfigParser
+except ImportError:
+    from ConfigParser import ConfigParser
+import mailbox
+import os.path
+import re
+import shutil
+import subprocess
+import sys
+try:
+    from urllib.error import HTTPError
+    from urllib.request import HTTPHandler, HTTPRedirectHandler, \
+        build_opener
+except ImportError:
+    from urllib2 import (HTTPError, HTTPHandler, HTTPRedirectHandler,
+                         build_opener)
+#from concurrent.futures import ProcessPoolExecutor
+from bs4 import BeautifulSoup
+import logging
+logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
+                    level=logging.INFO)
+
+ADDR_SEC_LABEL = 'addresses'
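+# Matches addresses in the spam-protected form Google Groups displays,
+# e.g. "someb...@example.com"; they are collected so the user can later
+# supply the real addresses in the generated .cnf file.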
+MANGLED_ADDR_RE = re.compile(
+    r'([a-zA-Z0-9_.+-]+\.\.\.@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)',
+    re.IGNORECASE)
+
+__version__ = '0.5'
+
+
+class Page(object):
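+    # One urllib opener shared by all page objects; HTTP debugging output
+    # is switched on when the root logger runs at DEBUG level.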
+    verb_handler = HTTPHandler()
+    if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
+        verb_handler.set_http_debuglevel(2)
+    redir_handler = HTTPRedirectHandler()
+    opener = build_opener(verb_handler, redir_handler)
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def unenscape_Google_bang_URL(old_URL):
+        """
+        See https://developers.google.com/webmasters\
+        /ajax-crawling/docs/getting-started for more information
+        """
+        if old_URL.find('#!') != -1:
+            esc_URL = old_URL.replace('#!', '?_escaped_fragment_=')
+            return esc_URL
+        else:
+            return old_URL
+
+    @classmethod
+    def do_redirect(cls, URL):
+        res = cls.opener.open(URL)
+
+        if res.getcode() == 200:
+            new_URL = res.geturl()
+            return cls.unenscape_Google_bang_URL(new_URL)
+        else:
+            raise HTTPError(URL, res.getcode(),
+                            'Unknown URL: {}'.format(URL), res.info(), None)
+
+    def _get_page_BS(self, URL):
+        res = self.opener.open(self.do_redirect(URL))
+        in_str = res.read()
+        bs = BeautifulSoup(in_str)
+        res.close()
+        return bs
+
+
+class Article(Page):
+    def __init__(self, URL):
+        super(Article, self).__init__()
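+        # rewrite the web-UI message URL into the raw message URL
+        # (.../forum/message/raw?msg=... returns the message source)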
+        self.root = URL.replace('d/msg/', 'forum/message/raw?msg=')
+        self.raw_message = ''
+
+    def collect_message(self):
+        logging.debug('self.root = {}'.format(self.root))
+        res = None
+        result = None
+        try:
+            res = self.opener.open(self.root)
+            raw_msg = res.read()
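+            # formail (part of procmail) reformats the downloaded text
+            # into a well-formed mail message before it goes into the mbox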
+            proc = subprocess.Popen(['/usr/bin/formail'],
+                                    stdin=subprocess.PIPE,
+                                    stdout=subprocess.PIPE,
+                                    universal_newlines=True)
+            result = proc.communicate(raw_msg.decode())[0]
+        except HTTPError as exc:
+            logging.warning('Exception on downloading {}:\n{}'.format(
+                self.root, exc))
+        finally:
+            if res is not None:
+                res.close()
+
+        return result
+
+
+class Topic(Page):
+    def __init__(self, URL, name):
+        super(Topic, self).__init__()
+        self.name = name
+        self.root = self.do_redirect(URL)
+        self.articles = []
+
+    def __unicode__(self):
+        return "%s: %s" % (self.root, self.name)
+
+    @staticmethod
+    def get_one_article(elem):
+        return elem
+
+    def get_count_articles(self):
+        '''Get total number of articles from the number on the page
+        itself.
+        '''
+        BS = self._get_page_BS(self.root)
+        i_elem = BS.find_all('i')
+        if len(i_elem) <= 0:
+            raise ValueError('Cannot find count of articles!')
+
+        i_str = i_elem[0].string
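+        # the <i> element carries text along the lines of
+        # "Showing 1-20 of 234 messages" (exact wording may vary);
+        # pull out the trailing total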
+        return int(re.match(r'\D+ \d+\D+\d+ \D+ (\d+) \D+$', i_str).group(1))
+
+    def get_articles(self):
+        out = []
+        page = self._get_page_BS(self.root)
+        for a_elem in page.find_all('a'):
+            if 'href' in a_elem.attrs:
+                a_href = a_elem['href']
+                if re.match(r'https://groups.google.com/d/msg/',
+                            a_href) is not None:
+                    out.append(Article(a_href))
+
+        return out
+
+
+class Group(Page):
+    def __init__(self, URL):
+        super(Group, self).__init__()
+        self.group_URL = URL
+        self.topics = []
+        match = re.match(r'https://groups.google.com/forum/#!forum/(.+)',
+                         URL)
+        if match is not None:
+            self.name = match.group(1)
+
+    @staticmethod
+    def get_count_topics(BS):
+        '''Get total number of topics from the number on the page
+        itself.
+
+        Which would be awesome for control, except it is wrong on all
+        pages in various and different ways. :(
+        '''
+        i_elem = BS.find_all('i')
+        if len(i_elem) <= 0:
+            raise ValueError('Cannot find count of topics!')
+
+        i_str = i_elem[0].string
+        return int(re.match(r'\D+ \d+ - \d+ \D+ (\d+) \D+$', i_str).group(1))
+
+    @staticmethod
+    def get_one_topic(elem):
+        sys.stdout.write('. ')
+        sys.stdout.flush()
+        if 'title' in elem.attrs:
+            # filter out all non-topic <a>s
+            return True, Topic(elem['href'], elem['title'])
+        else:
+            logging.debug('other = %s', elem)
+            return False, elem
+
+    def get_topics(self):
+        '''Recursively get all topics (as special objects).
+        Also return (for error checking) the number of topics from the
+        head of the topic page.
+        '''
+        out = []
+        other = []
+        BS = self._get_page_BS(self.group_URL)
+        for a_elem in BS.find_all('a'):
+            is_topic, res = self.get_one_topic(a_elem)
+            if is_topic:
+                out.append(res)
+            else:
+                other.append(res)
+
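+        # any <a> without a title should be the single link to the next
+        # page of the topic list; follow it and collect its topics as well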
+        if len(other) == 1:
+            new_bs = Group(other[0]['href'])
+            out.extend(new_bs.get_topics())
+        elif len(other) != 0:
+            raise ValueError(
+                'There must be either one or no link to the next page!')
+
+        sys.stdout.write('\n')
+        sys.stdout.flush()
+        return out
+
+    def collect_group(self):
+        self.topics = self.get_topics()
+        len_topics = len(self.topics)
+        for i, top in enumerate(self.topics):
+            print('[%d/%d] downloading "%s"' % (i, len_topics, top.name))
+            arts = top.get_articles()
+            top.articles = arts
+            for a in arts:
+                msg = a.collect_message()
+                if msg is not None:
+                    a.raw_message = msg
+
+    def all_messages(self):
+        '''Iterate over all messages in the group'''
+        for top in self.topics:
+            for art in top.articles:
+                yield art.raw_message
+
+    def collect_mangled_addrs(self):
+        addrs = set()
+        for top in self.topics:
+            for art in top.articles:
+                msg_str = art.raw_message
+                # see http://stackoverflow.com/questions/201323
+                msg_matches = MANGLED_ADDR_RE.findall(msg_str)
+                if msg_matches is not None:
+                    for mtch in msg_matches:
+                        addrs.add(mtch)
+
+        addrs = sorted(list(addrs))
+
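+        # write the collected addresses into <group>.cnf; the user is
+        # expected to fill in the real address as the value of each key,
+        # which demangle() later substitutes back into the mbox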
+        with open('{}.cnf'.format(self.name), 'w') as cnf_f:
+            cnf_p = ConfigParser()
+            cnf_p.add_section(ADDR_SEC_LABEL)
+            for addr in addrs:
+                cnf_p.set(ADDR_SEC_LABEL, addr, '')
+            cnf_p.write(cnf_f)
+
+
+class MBOX(mailbox.mbox):
+    def __init__(self, filename):
+        if os.path.exists(filename):
+            shutil.move(filename, '{0}.bak'.format(filename))
+        mailbox.mbox.__init__(self, filename)
+        self.box_name = filename
+
+    def write_group(self, group_object):
+        self.lock()
+        for mbx_str in group_object.all_messages():
+            self.add(mbx_str.encode())
+        self.unlock()
+        self.close()
+
+
+def main(group_URL):
+    # Collect all messages to the internal variables
+    grp = Group(group_URL)
+    grp.collect_group()
+
+    #import yaml
+    # dump the state for debugging
+    #with open('group.yaml', 'w') as yf:
+    #    yaml.dump(grp, yf)
+
+    # Write MBOX
+    mbx = MBOX("{}.mbx".format(grp.name))
+    mbx.write_group(grp)
+
+    # generate list of addresses protected against spammers
+    grp.collect_mangled_addrs()
+
+
+def demangle(correct_list, orig_mbx, out_mbx):
+    cnf_p = ConfigParser()
+    cnf_p.read(correct_list)
+    pairs = dict(cnf_p.items(ADDR_SEC_LABEL))
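+    # pairs maps each mangled address to the real one, as filled in by
+    # the user in the configuration file generated by collect_mangled_addrs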
+
+    if os.path.exists(out_mbx):
+        shutil.move(out_mbx, '{}.bak'.format(out_mbx))
+
+    in_mbx = mailbox.mbox(orig_mbx)
+    out_mbx = mailbox.mbox(out_mbx)
+
+    out_mbx.lock()
+    for msg in in_mbx.itervalues():
+        msg_str = str(msg)
+        matches = MANGLED_ADDR_RE.search(msg_str)
+        if matches is not None:
+            u_from = msg.get_from()
+            for orig, fixed in pairs.items():
+                if not fixed:
+                    continue
+                msg_str = msg_str.replace(orig, fixed)
+            out_msg = mailbox.mboxMessage(msg_str)
+            out_msg.set_from(u_from)
+
+            out_mbx.add(out_msg)
+        else:
+            out_mbx.add(msg)
+    out_mbx.close()
+    in_mbx.close()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Scrape a Google Groups group.')
+    parser.add_argument('group', metavar='URL', nargs='?',
+                        help='URL of the group')
+    parser.add_argument('-d', '--demangle', metavar='DEMANGLE_FILE', nargs=3,
+                        help='Demangle an mbox according to the values in '
+                             'the configuration file; expects the '
+                             'configuration file, the input mbox and the '
+                             'output mbox as its three arguments.')
+    args = parser.parse_args()
+
+    logging.debug('args = {}'.format(args))
+
+    if args.demangle is not None:
+        demangle(args.demangle[0], args.demangle[1], args.demangle[2])
+    else:
+        main(args.group)
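+
+# Example invocations (the group name here is only illustrative):
+#   ./gg_scraper.py 'https://groups.google.com/forum/#!forum/some-group'
+#   ./gg_scraper.py --demangle some-group.cnf some-group.mbx some-group.fixed.mbx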