forked from grangier/python-goose
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Xavier Grangier
committed
Dec 31, 2014
1 parent
530ab52
commit 49f50b0
Showing
129 changed files
with
252 additions
and
0 deletions.
There are no files selected for viewing
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,252 @@ | ||
# -*- coding: utf-8 -*- | ||
"""\ | ||
This is a python port of "Goose" orignialy licensed to Gravity.com | ||
under one or more contributor license agreements. See the NOTICE file | ||
distributed with this work for additional information | ||
regarding copyright ownership. | ||
Python port was written by Xavier Grangier for Recrutae | ||
Gravity.com licenses this file | ||
to you under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance | ||
with the License. You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
""" | ||
import os | ||
import json | ||
import urllib2 | ||
import unittest | ||
import socket | ||
|
||
from StringIO import StringIO | ||
|
||
from goose import Goose | ||
from goose.utils import FileHelper | ||
from goose.configuration import Configuration | ||
|
||
|
||
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) | ||
|
||
|
||
# Response | ||
class MockResponse(): | ||
"""\ | ||
Base mock response class | ||
""" | ||
code = 200 | ||
msg = "OK" | ||
|
||
def __init__(self, cls): | ||
self.cls = cls | ||
|
||
def content(self): | ||
return "response" | ||
|
||
def response(self, req): | ||
data = self.content(req) | ||
url = req.get_full_url() | ||
resp = urllib2.addinfourl(StringIO(data), data, url) | ||
resp.code = self.code | ||
resp.msg = self.msg | ||
return resp | ||
|
||
|
||
class MockHTTPHandler(urllib2.HTTPHandler, urllib2.HTTPSHandler): | ||
"""\ | ||
Mocked HTTPHandler in order to query APIs locally | ||
""" | ||
cls = None | ||
|
||
def https_open(self, req): | ||
return self.http_open(req) | ||
|
||
def http_open(self, req): | ||
r = self.cls.callback(self.cls) | ||
return r.response(req) | ||
|
||
@staticmethod | ||
def patch(cls): | ||
opener = urllib2.build_opener(MockHTTPHandler) | ||
urllib2.install_opener(opener) | ||
# dirty ! | ||
for h in opener.handlers: | ||
if isinstance(h, MockHTTPHandler): | ||
h.cls = cls | ||
return [h for h in opener.handlers if isinstance(h, MockHTTPHandler)][0] | ||
|
||
@staticmethod | ||
def unpatch(): | ||
# urllib2 | ||
urllib2._opener = None | ||
|
||
|
||
class BaseMockTests(unittest.TestCase): | ||
"""\ | ||
Base Mock test case | ||
""" | ||
callback = MockResponse | ||
|
||
def setUp(self): | ||
# patch DNS | ||
self.original_getaddrinfo = socket.getaddrinfo | ||
socket.getaddrinfo = self.new_getaddrinfo | ||
MockHTTPHandler.patch(self) | ||
|
||
def tearDown(self): | ||
MockHTTPHandler.unpatch() | ||
# DNS | ||
socket.getaddrinfo = self.original_getaddrinfo | ||
|
||
def new_getaddrinfo(self, *args): | ||
return [(2, 1, 6, '', ('127.0.0.1', 0))] | ||
|
||
def _get_current_testname(self): | ||
return self.id().split('.')[-1:][0] | ||
|
||
|
||
class MockResponseExtractors(MockResponse): | ||
def content(self, req): | ||
current_test = self.cls._get_current_testname() | ||
path = os.path.join( | ||
os.path.dirname(CURRENT_PATH), | ||
"data", | ||
"extractors", | ||
"content", | ||
"%s.html" % current_test) | ||
path = os.path.abspath(path) | ||
content = FileHelper.loadResourceFile(path) | ||
return content | ||
|
||
|
||
class TestExtractionBase(BaseMockTests): | ||
"""\ | ||
Extraction test case | ||
""" | ||
callback = MockResponseExtractors | ||
|
||
def getRawHtml(self): | ||
test, suite, module, cls, func = self.id().split('.') | ||
path = os.path.join( | ||
os.path.dirname(CURRENT_PATH), | ||
"data", | ||
suite, | ||
module, | ||
"%s.html" % func) | ||
path = os.path.abspath(path) | ||
content = FileHelper.loadResourceFile(path) | ||
return content | ||
|
||
def loadData(self): | ||
"""\ | ||
""" | ||
test, suite, module, cls, func = self.id().split('.') | ||
path = os.path.join( | ||
os.path.dirname(CURRENT_PATH), | ||
"data", | ||
suite, | ||
module, | ||
"%s.json" % func) | ||
path = os.path.abspath(path) | ||
content = FileHelper.loadResourceFile(path) | ||
self.data = json.loads(content) | ||
|
||
def assert_cleaned_text(self, field, expected_value, result_value): | ||
"""\ | ||
""" | ||
# # TODO : handle verbose level in tests | ||
# print "\n=======================::. ARTICLE REPORT %s .::======================\n" % self.id() | ||
# print 'expected_value (%s) \n' % len(expected_value) | ||
# print expected_value | ||
# print "-------" | ||
# print 'result_value (%s) \n' % len(result_value) | ||
# print result_value | ||
|
||
# cleaned_text is Null | ||
msg = u"Resulting article text was NULL!" | ||
self.assertNotEqual(result_value, None, msg=msg) | ||
|
||
# cleaned_text length | ||
msg = u"Article text was not as long as expected beginning!" | ||
self.assertTrue(len(expected_value) <= len(result_value), msg=msg) | ||
|
||
# clean_text value | ||
result_value = result_value[0:len(expected_value)] | ||
msg = u"The beginning of the article text was not as expected!" | ||
self.assertEqual(expected_value, result_value, msg=msg) | ||
|
||
def assert_tags(self, field, expected_value, result_value): | ||
"""\ | ||
""" | ||
# as we have a set in expected_value and a list in result_value | ||
# make result_value a set | ||
expected_value = set(expected_value) | ||
|
||
# check if both have the same number of items | ||
msg = (u"expected tags set and result tags set" | ||
u"don't have the same number of items") | ||
self.assertEqual(len(result_value), len(expected_value), msg=msg) | ||
|
||
# check if each tag in result_value is in expected_value | ||
for tag in result_value: | ||
self.assertTrue(tag in expected_value) | ||
|
||
def runArticleAssertions(self, article, fields): | ||
"""\ | ||
""" | ||
for field in fields: | ||
expected_value = self.data['expected'][field] | ||
result_value = getattr(article, field, None) | ||
|
||
# custom assertion for a given field | ||
assertion = 'assert_%s' % field | ||
if hasattr(self, assertion): | ||
getattr(self, assertion)(field, expected_value, result_value) | ||
continue | ||
|
||
# default assertion | ||
msg = u"Error %s \nexpected: %s\nresult: %s" % (field, expected_value, result_value) | ||
self.assertEqual(expected_value, result_value, msg=msg) | ||
|
||
def extract(self, instance): | ||
article = instance.extract(url=self.data['url']) | ||
return article | ||
|
||
def getConfig(self): | ||
config = Configuration() | ||
config.enable_image_fetching = False | ||
return config | ||
|
||
def getArticle(self): | ||
"""\ | ||
""" | ||
# load test case data | ||
self.loadData() | ||
|
||
# basic configuration | ||
# no image fetching | ||
config = self.getConfig() | ||
self.parser = config.get_parser() | ||
|
||
# target language | ||
# needed for non english language most of the time | ||
target_language = self.data.get('target_language') | ||
if target_language: | ||
config.target_language = target_language | ||
config.use_meta_language = False | ||
|
||
# run goose | ||
g = Goose(config=config) | ||
return self.extract(g) |
File renamed without changes.
File renamed without changes.
File renamed without changes.