aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMatěj Cepl <mcepl@redhat.com>2013-11-22 01:38:31 +0100
committerMatěj Cepl <mcepl@redhat.com>2013-11-22 01:48:37 +0100
commitaefe5bd9372308437e4d81daf4317d4eb086fb1d (patch)
treeda7719f595241407e4a1f5ffa4ff37713324ae12
downloadgg_scraper-aefe5bd9372308437e4d81daf4317d4eb086fb1d.tar.gz
Start of the project.
Based on the unescaping algorithm from https://developers.google.com/webmasters/ajax-crawling/docs/getting-started (of which I was reminded by Sean Hogan (http://www.meekostuff.net/))
-rw-r--r--.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/0429b1c7-f69b-4c43-8545-8f063f570e31/values50
-rw-r--r--.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/1b2264d9-f5e3-4a60-82a7-f26b6bff1070/values50
-rw-r--r--.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/52c9c6c0-8922-48b7-87a8-73cdc8434e32/values50
-rw-r--r--.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/88be1a53-84c7-470a-9a94-29cf753cfd62/values50
-rw-r--r--.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/d8e67c6a-955f-45c1-a689-326f1cfa2b52/values50
-rw-r--r--.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/e66f428c-ee73-4e39-b073-5d6819075e13/values50
-rw-r--r--.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/settings1
-rw-r--r--.be/version1
-rw-r--r--.gitignore1
-rw-r--r--README.rst2
-rwxr-xr-xgg_scrapper.py50
-rw-r--r--test_gg_scrapper.py20
12 files changed, 375 insertions, 0 deletions
diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/0429b1c7-f69b-4c43-8545-8f063f570e31/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/0429b1c7-f69b-4c43-8545-8f063f570e31/values
new file mode 100644
index 0000000..155b9f7
--- /dev/null
+++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/0429b1c7-f69b-4c43-8545-8f063f570e31/values
@@ -0,0 +1,50 @@
+{
+
+
+
+
+
+
+ "creator": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "severity": "minor",
+
+
+
+
+
+
+ "status": "open",
+
+
+
+
+
+
+ "summary": "Collect all articles for a topic",
+
+
+
+
+
+
+ "time": "Fri, 22 Nov 2013 00:41:32 +0000"
+
+
+
+
+
+
+}
diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/1b2264d9-f5e3-4a60-82a7-f26b6bff1070/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/1b2264d9-f5e3-4a60-82a7-f26b6bff1070/values
new file mode 100644
index 0000000..a20e39a
--- /dev/null
+++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/1b2264d9-f5e3-4a60-82a7-f26b6bff1070/values
@@ -0,0 +1,50 @@
+{
+
+
+
+
+
+
+ "creator": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "severity": "minor",
+
+
+
+
+
+
+ "status": "open",
+
+
+
+
+
+
+ "summary": "Write MBOX",
+
+
+
+
+
+
+ "time": "Fri, 22 Nov 2013 00:42:03 +0000"
+
+
+
+
+
+
+}
diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/52c9c6c0-8922-48b7-87a8-73cdc8434e32/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/52c9c6c0-8922-48b7-87a8-73cdc8434e32/values
new file mode 100644
index 0000000..d119fe3
--- /dev/null
+++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/52c9c6c0-8922-48b7-87a8-73cdc8434e32/values
@@ -0,0 +1,50 @@
+{
+
+
+
+
+
+
+ "creator": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "severity": "minor",
+
+
+
+
+
+
+ "status": "open",
+
+
+
+
+
+
+ "summary": "Collect all topics",
+
+
+
+
+
+
+ "time": "Fri, 22 Nov 2013 00:41:20 +0000"
+
+
+
+
+
+
+}
diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/88be1a53-84c7-470a-9a94-29cf753cfd62/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/88be1a53-84c7-470a-9a94-29cf753cfd62/values
new file mode 100644
index 0000000..d7e53fc
--- /dev/null
+++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/88be1a53-84c7-470a-9a94-29cf753cfd62/values
@@ -0,0 +1,50 @@
+{
+
+
+
+
+
+
+ "creator": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "severity": "minor",
+
+
+
+
+
+
+ "status": "open",
+
+
+
+
+
+
+ "summary": "Parse an article",
+
+
+
+
+
+
+ "time": "Fri, 22 Nov 2013 00:41:41 +0000"
+
+
+
+
+
+
+}
diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/d8e67c6a-955f-45c1-a689-326f1cfa2b52/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/d8e67c6a-955f-45c1-a689-326f1cfa2b52/values
new file mode 100644
index 0000000..7afd23c
--- /dev/null
+++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/d8e67c6a-955f-45c1-a689-326f1cfa2b52/values
@@ -0,0 +1,50 @@
+{
+
+
+
+
+
+
+ "creator": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "severity": "minor",
+
+
+
+
+
+
+ "status": "open",
+
+
+
+
+
+
+ "summary": "Convert an article to an email message",
+
+
+
+
+
+
+ "time": "Fri, 22 Nov 2013 00:41:57 +0000"
+
+
+
+
+
+
+}
diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/e66f428c-ee73-4e39-b073-5d6819075e13/values b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/e66f428c-ee73-4e39-b073-5d6819075e13/values
new file mode 100644
index 0000000..4e84001
--- /dev/null
+++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/bugs/e66f428c-ee73-4e39-b073-5d6819075e13/values
@@ -0,0 +1,50 @@
+{
+
+
+
+
+
+
+ "creator": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "reporter": "Mat\u011bj Cepl <mcepl@redhat.com>",
+
+
+
+
+
+
+ "severity": "minor",
+
+
+
+
+
+
+ "status": "open",
+
+
+
+
+
+
+ "summary": "Collect all HREF links on a page",
+
+
+
+
+
+
+ "time": "Fri, 22 Nov 2013 00:48:31 +0000"
+
+
+
+
+
+
+}
diff --git a/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/settings b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/settings
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/.be/5e8651c6-280c-44b1-9a16-017d4ab492f6/settings
@@ -0,0 +1 @@
+{}
diff --git a/.be/version b/.be/version
new file mode 100644
index 0000000..38d39ae
--- /dev/null
+++ b/.be/version
@@ -0,0 +1 @@
+Bugs Everywhere Directory v1.5
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..55292ee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.be/id-cache
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..e54e06c
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,2 @@
+A small script as a replacement of
+http://saturnboy.com/2010/03/scraping-google-groups/
diff --git a/gg_scrapper.py b/gg_scrapper.py
new file mode 100755
index 0000000..59b81bc
--- /dev/null
+++ b/gg_scrapper.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python
+
+import urllib2
+from bs4 import BeautifulSoup
+import logging
+logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
+ level=logging.DEBUG)
+
+
+class GooglePage:
+ verb_handler = urllib2.HTTPHandler()
+ if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
+ verb_handler.set_http_debuglevel(2)
+ redir_handler = urllib2.HTTPRedirectHandler()
+ opener = urllib2.build_opener(verb_handler, redir_handler)
+
+ def __init__(self, URL):
+ self.bs_page = self.get_first_page_BS(URL)
+
+ @staticmethod
+ def unenscape_Google_bang_URL(old_URL):
+ """
+ See https://developers.google.com/webmasters\
+ /ajax-crawling/docs/getting-started for more information
+ """
+ if old_URL.find('#!') != -1:
+ esc_URL = old_URL.replace('#!', '?_escaped_fragment_=')
+ logging.debug('esc_URL = {}'.format(esc_URL))
+ return esc_URL
+ else:
+ return old_URL
+
+ @classmethod
+ def do_redirect(cls, URL):
+ res = cls.opener.open(URL)
+
+ if res.getcode() == 200:
+ new_URL = res.geturl()
+ logging.debug('url = {}'.format(new_URL))
+ return cls.unenscape_Google_bang_URL(new_URL)
+ else:
+ raise urllib2.HTTPError('Unknown URL: {}'.format(URL))
+
+ def get_first_page_BS(self, URL):
+ with self.opener.open(self.do_redirect(URL)) as esc_res:
+ return BeautifulSoup(esc_res.read())
+
+ def get_topics(self, BS):
+ 'Recursively[?] get all topic (as special objects)'
+ return []
diff --git a/test_gg_scrapper.py b/test_gg_scrapper.py
new file mode 100644
index 0000000..079c3c1
--- /dev/null
+++ b/test_gg_scrapper.py
@@ -0,0 +1,20 @@
+import unittest
+import gg_scrapper
+
+ORIG_URL = 'http://groups.google.com/d/forum/jbrout'
+EXP_URL = 'https://groups.google.com/forum/' + \
+ '?_escaped_fragment_=forum/jbrout'
+
+
+class TestGGScrapper(unittest.TestCase):
+ def test_URL_conversion(self):
+ in_URL = 'https://groups.google.com/forum/#!forum/jbrout'
+ obs_URL = gg_scrapper.GooglePage.unenscape_Google_bang_URL(in_URL)
+ self.assertEqual(obs_URL, EXP_URL)
+
+ def test_do_redirect(self):
+ obs_URL = gg_scrapper.GooglePage.do_redirect(ORIG_URL)
+ self.assertEqual(obs_URL, EXP_URL)
+
+if __name__ == '__main__':
+ unittest.main()