diff options
author | Matěj Cepl <mcepl@redhat.com> | 2013-11-22 01:38:31 +0100 |
---|---|---|
committer | Matěj Cepl <mcepl@redhat.com> | 2013-11-22 01:48:37 +0100 |
commit | aefe5bd9372308437e4d81daf4317d4eb086fb1d (patch) | |
tree | da7719f595241407e4a1f5ffa4ff37713324ae12 | |
download | gg_scraper-aefe5bd9372308437e4d81daf4317d4eb086fb1d.tar.gz |
Start of the project.
Based on the unescaping algorithm from
https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
(of which I was reminded by Sean Hogan (http://www.meekostuff.net/))
12 files changed, 375 insertions, 0 deletions
diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/0429b1c7-f69b-4c43-8545-8f063f570e31/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/0429b1c7-f69b-4c43-8545-8f063f570e31/values new file mode 100644 index 0000000..155b9f7 --- /dev/null +++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/0429b1c7-f69b-4c43-8545-8f063f570e31/values @@ -0,0 +1,50 @@ +{ + + + + + + + "creator": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "severity": "minor", + + + + + + + "status": "open", + + + + + + + "summary": "Collect all articles for a topic", + + + + + + + "time": "Fri, 22 Nov 2013 00:41:32 +0000" + + + + + + +} diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/1b2264d9-f5e3-4a60-82a7-f26b6bff1070/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/1b2264d9-f5e3-4a60-82a7-f26b6bff1070/values new file mode 100644 index 0000000..a20e39a --- /dev/null +++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/1b2264d9-f5e3-4a60-82a7-f26b6bff1070/values @@ -0,0 +1,50 @@ +{ + + + + + + + "creator": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "severity": "minor", + + + + + + + "status": "open", + + + + + + + "summary": "Write MBOX", + + + + + + + "time": "Fri, 22 Nov 2013 00:42:03 +0000" + + + + + + +} diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/52c9c6c0-8922-48b7-87a8-73cdc8434e32/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/52c9c6c0-8922-48b7-87a8-73cdc8434e32/values new file mode 100644 index 0000000..d119fe3 --- /dev/null +++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/52c9c6c0-8922-48b7-87a8-73cdc8434e32/values @@ -0,0 +1,50 @@ +{ + + + + + + + "creator": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "severity": "minor", + + + + + + + "status": "open", + + + + + + + "summary": "Collect all topics", + + + + + + + "time": 
"Fri, 22 Nov 2013 00:41:20 +0000" + + + + + + +} diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/88be1a53-84c7-470a-9a94-29cf753cfd62/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/88be1a53-84c7-470a-9a94-29cf753cfd62/values new file mode 100644 index 0000000..d7e53fc --- /dev/null +++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/88be1a53-84c7-470a-9a94-29cf753cfd62/values @@ -0,0 +1,50 @@ +{ + + + + + + + "creator": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "severity": "minor", + + + + + + + "status": "open", + + + + + + + "summary": "Parse an article", + + + + + + + "time": "Fri, 22 Nov 2013 00:41:41 +0000" + + + + + + +} diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/d8e67c6a-955f-45c1-a689-326f1cfa2b52/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/d8e67c6a-955f-45c1-a689-326f1cfa2b52/values new file mode 100644 index 0000000..7afd23c --- /dev/null +++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/d8e67c6a-955f-45c1-a689-326f1cfa2b52/values @@ -0,0 +1,50 @@ +{ + + + + + + + "creator": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "severity": "minor", + + + + + + + "status": "open", + + + + + + + "summary": "Convert an article to an email message", + + + + + + + "time": "Fri, 22 Nov 2013 00:41:57 +0000" + + + + + + +} diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/e66f428c-ee73-4e39-b073-5d6819075e13/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/e66f428c-ee73-4e39-b073-5d6819075e13/values new file mode 100644 index 0000000..4e84001 --- /dev/null +++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/e66f428c-ee73-4e39-b073-5d6819075e13/values @@ -0,0 +1,50 @@ +{ + + + + + + + "creator": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>", + + + + + + + "severity": "minor", + + + + + + + "status": "open", + + + + 
+ + + "summary": "Collect all HREF links on a page", + + + + + + + "time": "Fri, 22 Nov 2013 00:48:31 +0000" + + + + + + +} diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/settings b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/settings new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/settings @@ -0,0 +1 @@ +{} diff --git a/.be/version b/.be/version new file mode 100644 index 0000000..38d39ae --- /dev/null +++ b/.be/version @@ -0,0 +1 @@ +Bugs Everywhere Directory v1.5 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..55292ee --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.be/id-cache diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..e54e06c --- /dev/null +++ b/README.rst @@ -0,0 +1,2 @@ +A small script as a replacement of +http://saturnboy.com/2010/03/scraping-google-groups/ diff --git a/gg_scrapper.py b/gg_scrapper.py new file mode 100755 index 0000000..59b81bc --- /dev/null +++ b/gg_scrapper.py @@ -0,0 +1,50 @@ +#!/usr/bin/python + +import urllib2 +from bs4 import BeautifulSoup +import logging +logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', + level=logging.DEBUG) + + +class GooglePage: + verb_handler = urllib2.HTTPHandler() + if logging.getLogger().getEffectiveLevel() == logging.DEBUG: + verb_handler.set_http_debuglevel(2) + redir_handler = urllib2.HTTPRedirectHandler() + opener = urllib2.build_opener(verb_handler, redir_handler) + + def __init__(self, URL): + self.bs_page = self.get_first_page_BS(URL) + + @staticmethod + def unenscape_Google_bang_URL(old_URL): + """ + See https://developers.google.com/webmasters\ + /ajax-crawling/docs/getting-started for more information + """ + if old_URL.find('#!') != -1: + esc_URL = old_URL.replace('#!', '?_escaped_fragment_=') + logging.debug('esc_URL = {}'.format(esc_URL)) + return esc_URL + else: + return old_URL + + @classmethod + def do_redirect(cls, URL): + res = cls.opener.open(URL) + + if 
res.getcode() == 200: + new_URL = res.geturl() + logging.debug('url = {}'.format(new_URL)) + return cls.unenscape_Google_bang_URL(new_URL) + else: + raise urllib2.HTTPError('Unknown URL: {}'.format(URL)) + + def get_first_page_BS(self, URL): + with self.opener.open(self.do_redirect(URL)) as esc_res: + return BeautifulSoup(esc_res.read()) + + def get_topics(self, BS): + 'Recursively[?] get all topic (as special objects)' + return [] diff --git a/test_gg_scrapper.py b/test_gg_scrapper.py new file mode 100644 index 0000000..079c3c1 --- /dev/null +++ b/test_gg_scrapper.py @@ -0,0 +1,20 @@ +import unittest +import gg_scrapper + +ORIG_URL = 'http://groups.google.com/d/forum/jbrout' +EXP_URL = 'https://groups.google.com/forum/' + \ + '?_escaped_fragment_=forum/jbrout' + + +class TestGGScrapper(unittest.TestCase): + def test_URL_conversion(self): + in_URL = 'https://groups.google.com/forum/#!forum/jbrout' + obs_URL = gg_scrapper.GooglePage.unenscape_Google_bang_URL(in_URL) + self.assertEqual(obs_URL, EXP_URL) + + def test_do_redirect(self): + obs_URL = gg_scrapper.GooglePage.do_redirect(ORIG_URL) + self.assertEqual(obs_URL, EXP_URL) + +if __name__ == '__main__': + unittest.main() |