diff --git a/tests/data/extractors/test_allnewlyrics1.html b/tests/data/extractors/content/test_allnewlyrics1.html similarity index 100% rename from tests/data/extractors/test_allnewlyrics1.html rename to tests/data/extractors/content/test_allnewlyrics1.html diff --git a/tests/data/extractors/test_allnewlyrics1.json b/tests/data/extractors/content/test_allnewlyrics1.json similarity index 100% rename from tests/data/extractors/test_allnewlyrics1.json rename to tests/data/extractors/content/test_allnewlyrics1.json diff --git a/tests/data/extractors/test_aolNews.html b/tests/data/extractors/content/test_aolNews.html similarity index 100% rename from tests/data/extractors/test_aolNews.html rename to tests/data/extractors/content/test_aolNews.html diff --git a/tests/data/extractors/test_aolNews.json b/tests/data/extractors/content/test_aolNews.json similarity index 100% rename from tests/data/extractors/test_aolNews.json rename to tests/data/extractors/content/test_aolNews.json diff --git a/tests/data/extractors/test_articlebody_attribute.html b/tests/data/extractors/content/test_articlebody_attribute.html similarity index 100% rename from tests/data/extractors/test_articlebody_attribute.html rename to tests/data/extractors/content/test_articlebody_attribute.html diff --git a/tests/data/extractors/test_articlebody_attribute.json b/tests/data/extractors/content/test_articlebody_attribute.json similarity index 100% rename from tests/data/extractors/test_articlebody_attribute.json rename to tests/data/extractors/content/test_articlebody_attribute.json diff --git a/tests/data/extractors/test_articlebody_itemprop.html b/tests/data/extractors/content/test_articlebody_itemprop.html similarity index 100% rename from tests/data/extractors/test_articlebody_itemprop.html rename to tests/data/extractors/content/test_articlebody_itemprop.html diff --git a/tests/data/extractors/test_articlebody_itemprop.json b/tests/data/extractors/content/test_articlebody_itemprop.json similarity 
index 100% rename from tests/data/extractors/test_articlebody_itemprop.json rename to tests/data/extractors/content/test_articlebody_itemprop.json diff --git a/tests/data/extractors/test_articlebody_tag.html b/tests/data/extractors/content/test_articlebody_tag.html similarity index 100% rename from tests/data/extractors/test_articlebody_tag.html rename to tests/data/extractors/content/test_articlebody_tag.html diff --git a/tests/data/extractors/test_articlebody_tag.json b/tests/data/extractors/content/test_articlebody_tag.json similarity index 100% rename from tests/data/extractors/test_articlebody_tag.json rename to tests/data/extractors/content/test_articlebody_tag.json diff --git a/tests/data/extractors/test_author_schema.html b/tests/data/extractors/content/test_author_schema.html similarity index 100% rename from tests/data/extractors/test_author_schema.html rename to tests/data/extractors/content/test_author_schema.html diff --git a/tests/data/extractors/test_author_schema.json b/tests/data/extractors/content/test_author_schema.json similarity index 100% rename from tests/data/extractors/test_author_schema.json rename to tests/data/extractors/content/test_author_schema.json diff --git a/tests/data/extractors/test_bbc_chinese.html b/tests/data/extractors/content/test_bbc_chinese.html similarity index 100% rename from tests/data/extractors/test_bbc_chinese.html rename to tests/data/extractors/content/test_bbc_chinese.html diff --git a/tests/data/extractors/test_bbc_chinese.json b/tests/data/extractors/content/test_bbc_chinese.json similarity index 100% rename from tests/data/extractors/test_bbc_chinese.json rename to tests/data/extractors/content/test_bbc_chinese.json diff --git a/tests/data/extractors/test_businessWeek1.html b/tests/data/extractors/content/test_businessWeek1.html similarity index 100% rename from tests/data/extractors/test_businessWeek1.html rename to tests/data/extractors/content/test_businessWeek1.html diff --git 
a/tests/data/extractors/test_businessWeek1.json b/tests/data/extractors/content/test_businessWeek1.json similarity index 100% rename from tests/data/extractors/test_businessWeek1.json rename to tests/data/extractors/content/test_businessWeek1.json diff --git a/tests/data/extractors/test_businessWeek2.html b/tests/data/extractors/content/test_businessWeek2.html similarity index 100% rename from tests/data/extractors/test_businessWeek2.html rename to tests/data/extractors/content/test_businessWeek2.html diff --git a/tests/data/extractors/test_businessWeek2.json b/tests/data/extractors/content/test_businessWeek2.json similarity index 100% rename from tests/data/extractors/test_businessWeek2.json rename to tests/data/extractors/content/test_businessWeek2.json diff --git a/tests/data/extractors/test_businessWeek3.html b/tests/data/extractors/content/test_businessWeek3.html similarity index 100% rename from tests/data/extractors/test_businessWeek3.html rename to tests/data/extractors/content/test_businessWeek3.html diff --git a/tests/data/extractors/test_businessWeek3.json b/tests/data/extractors/content/test_businessWeek3.json similarity index 100% rename from tests/data/extractors/test_businessWeek3.json rename to tests/data/extractors/content/test_businessWeek3.json diff --git a/tests/data/extractors/test_businessinsider3.html b/tests/data/extractors/content/test_businessinsider3.html similarity index 100% rename from tests/data/extractors/test_businessinsider3.html rename to tests/data/extractors/content/test_businessinsider3.html diff --git a/tests/data/extractors/test_businessinsider3.json b/tests/data/extractors/content/test_businessinsider3.json similarity index 100% rename from tests/data/extractors/test_businessinsider3.json rename to tests/data/extractors/content/test_businessinsider3.json diff --git a/tests/data/extractors/test_cbslocal.html b/tests/data/extractors/content/test_cbslocal.html similarity index 100% rename from 
tests/data/extractors/test_cbslocal.html rename to tests/data/extractors/content/test_cbslocal.html diff --git a/tests/data/extractors/test_cbslocal.json b/tests/data/extractors/content/test_cbslocal.json similarity index 100% rename from tests/data/extractors/test_cbslocal.json rename to tests/data/extractors/content/test_cbslocal.json diff --git a/tests/data/extractors/test_cnbc1.html b/tests/data/extractors/content/test_cnbc1.html similarity index 100% rename from tests/data/extractors/test_cnbc1.html rename to tests/data/extractors/content/test_cnbc1.html diff --git a/tests/data/extractors/test_cnbc1.json b/tests/data/extractors/content/test_cnbc1.json similarity index 100% rename from tests/data/extractors/test_cnbc1.json rename to tests/data/extractors/content/test_cnbc1.json diff --git a/tests/data/extractors/test_cnet.html b/tests/data/extractors/content/test_cnet.html similarity index 100% rename from tests/data/extractors/test_cnet.html rename to tests/data/extractors/content/test_cnet.html diff --git a/tests/data/extractors/test_cnet.json b/tests/data/extractors/content/test_cnet.json similarity index 100% rename from tests/data/extractors/test_cnet.json rename to tests/data/extractors/content/test_cnet.json diff --git a/tests/data/extractors/test_cnn1.html b/tests/data/extractors/content/test_cnn1.html similarity index 100% rename from tests/data/extractors/test_cnn1.html rename to tests/data/extractors/content/test_cnn1.html diff --git a/tests/data/extractors/test_cnn1.json b/tests/data/extractors/content/test_cnn1.json similarity index 100% rename from tests/data/extractors/test_cnn1.json rename to tests/data/extractors/content/test_cnn1.json diff --git a/tests/data/extractors/test_cnn_arabic.html b/tests/data/extractors/content/test_cnn_arabic.html similarity index 100% rename from tests/data/extractors/test_cnn_arabic.html rename to tests/data/extractors/content/test_cnn_arabic.html diff --git a/tests/data/extractors/test_cnn_arabic.json 
b/tests/data/extractors/content/test_cnn_arabic.json similarity index 100% rename from tests/data/extractors/test_cnn_arabic.json rename to tests/data/extractors/content/test_cnn_arabic.json diff --git a/tests/data/extractors/test_donga_korean.html b/tests/data/extractors/content/test_donga_korean.html similarity index 100% rename from tests/data/extractors/test_donga_korean.html rename to tests/data/extractors/content/test_donga_korean.html diff --git a/tests/data/extractors/test_donga_korean.json b/tests/data/extractors/content/test_donga_korean.json similarity index 100% rename from tests/data/extractors/test_donga_korean.json rename to tests/data/extractors/content/test_donga_korean.json diff --git a/tests/data/extractors/test_elmondo1.html b/tests/data/extractors/content/test_elmondo1.html similarity index 100% rename from tests/data/extractors/test_elmondo1.html rename to tests/data/extractors/content/test_elmondo1.html diff --git a/tests/data/extractors/test_elmondo1.json b/tests/data/extractors/content/test_elmondo1.json similarity index 100% rename from tests/data/extractors/test_elmondo1.json rename to tests/data/extractors/content/test_elmondo1.json diff --git a/tests/data/extractors/test_elpais.html b/tests/data/extractors/content/test_elpais.html similarity index 100% rename from tests/data/extractors/test_elpais.html rename to tests/data/extractors/content/test_elpais.html diff --git a/tests/data/extractors/test_elpais.json b/tests/data/extractors/content/test_elpais.json similarity index 100% rename from tests/data/extractors/test_elpais.json rename to tests/data/extractors/content/test_elpais.json diff --git a/tests/data/extractors/test_engadget.html b/tests/data/extractors/content/test_engadget.html similarity index 100% rename from tests/data/extractors/test_engadget.html rename to tests/data/extractors/content/test_engadget.html diff --git a/tests/data/extractors/test_engadget.json b/tests/data/extractors/content/test_engadget.json similarity 
index 100% rename from tests/data/extractors/test_engadget.json rename to tests/data/extractors/content/test_engadget.json diff --git a/tests/data/extractors/test_espn.html b/tests/data/extractors/content/test_espn.html similarity index 100% rename from tests/data/extractors/test_espn.html rename to tests/data/extractors/content/test_espn.html diff --git a/tests/data/extractors/test_espn.json b/tests/data/extractors/content/test_espn.json similarity index 100% rename from tests/data/extractors/test_espn.json rename to tests/data/extractors/content/test_espn.json diff --git a/tests/data/extractors/test_foxNews.html b/tests/data/extractors/content/test_foxNews.html similarity index 100% rename from tests/data/extractors/test_foxNews.html rename to tests/data/extractors/content/test_foxNews.html diff --git a/tests/data/extractors/test_foxNews.json b/tests/data/extractors/content/test_foxNews.json similarity index 100% rename from tests/data/extractors/test_foxNews.json rename to tests/data/extractors/content/test_foxNews.json diff --git a/tests/data/extractors/test_get_canonical_url.html b/tests/data/extractors/content/test_get_canonical_url.html similarity index 100% rename from tests/data/extractors/test_get_canonical_url.html rename to tests/data/extractors/content/test_get_canonical_url.html diff --git a/tests/data/extractors/test_get_canonical_url.json b/tests/data/extractors/content/test_get_canonical_url.json similarity index 100% rename from tests/data/extractors/test_get_canonical_url.json rename to tests/data/extractors/content/test_get_canonical_url.json diff --git a/tests/data/extractors/test_gizmodo1.html b/tests/data/extractors/content/test_gizmodo1.html similarity index 100% rename from tests/data/extractors/test_gizmodo1.html rename to tests/data/extractors/content/test_gizmodo1.html diff --git a/tests/data/extractors/test_gizmodo1.json b/tests/data/extractors/content/test_gizmodo1.json similarity index 100% rename from 
tests/data/extractors/test_gizmodo1.json rename to tests/data/extractors/content/test_gizmodo1.json diff --git a/tests/data/extractors/test_guardian1.html b/tests/data/extractors/content/test_guardian1.html similarity index 100% rename from tests/data/extractors/test_guardian1.html rename to tests/data/extractors/content/test_guardian1.html diff --git a/tests/data/extractors/test_guardian1.json b/tests/data/extractors/content/test_guardian1.json similarity index 100% rename from tests/data/extractors/test_guardian1.json rename to tests/data/extractors/content/test_guardian1.json diff --git a/tests/data/extractors/test_huffingtonPost2.html b/tests/data/extractors/content/test_huffingtonPost2.html similarity index 100% rename from tests/data/extractors/test_huffingtonPost2.html rename to tests/data/extractors/content/test_huffingtonPost2.html diff --git a/tests/data/extractors/test_huffingtonPost2.json b/tests/data/extractors/content/test_huffingtonPost2.json similarity index 100% rename from tests/data/extractors/test_huffingtonPost2.json rename to tests/data/extractors/content/test_huffingtonPost2.json diff --git a/tests/data/extractors/test_issue115.html b/tests/data/extractors/content/test_issue115.html similarity index 100% rename from tests/data/extractors/test_issue115.html rename to tests/data/extractors/content/test_issue115.html diff --git a/tests/data/extractors/test_issue115.json b/tests/data/extractors/content/test_issue115.json similarity index 100% rename from tests/data/extractors/test_issue115.json rename to tests/data/extractors/content/test_issue115.json diff --git a/tests/data/extractors/test_issue129.html b/tests/data/extractors/content/test_issue129.html similarity index 100% rename from tests/data/extractors/test_issue129.html rename to tests/data/extractors/content/test_issue129.html diff --git a/tests/data/extractors/test_issue129.json b/tests/data/extractors/content/test_issue129.json similarity index 100% rename from 
tests/data/extractors/test_issue129.json rename to tests/data/extractors/content/test_issue129.json diff --git a/tests/data/extractors/test_issue24.html b/tests/data/extractors/content/test_issue24.html similarity index 100% rename from tests/data/extractors/test_issue24.html rename to tests/data/extractors/content/test_issue24.html diff --git a/tests/data/extractors/test_issue24.json b/tests/data/extractors/content/test_issue24.json similarity index 100% rename from tests/data/extractors/test_issue24.json rename to tests/data/extractors/content/test_issue24.json diff --git a/tests/data/extractors/test_issue25.html b/tests/data/extractors/content/test_issue25.html similarity index 100% rename from tests/data/extractors/test_issue25.html rename to tests/data/extractors/content/test_issue25.html diff --git a/tests/data/extractors/test_issue25.json b/tests/data/extractors/content/test_issue25.json similarity index 100% rename from tests/data/extractors/test_issue25.json rename to tests/data/extractors/content/test_issue25.json diff --git a/tests/data/extractors/test_issue28.html b/tests/data/extractors/content/test_issue28.html similarity index 100% rename from tests/data/extractors/test_issue28.html rename to tests/data/extractors/content/test_issue28.html diff --git a/tests/data/extractors/test_issue28.json b/tests/data/extractors/content/test_issue28.json similarity index 100% rename from tests/data/extractors/test_issue28.json rename to tests/data/extractors/content/test_issue28.json diff --git a/tests/data/extractors/test_issue32.html b/tests/data/extractors/content/test_issue32.html similarity index 100% rename from tests/data/extractors/test_issue32.html rename to tests/data/extractors/content/test_issue32.html diff --git a/tests/data/extractors/test_issue32.json b/tests/data/extractors/content/test_issue32.json similarity index 100% rename from tests/data/extractors/test_issue32.json rename to tests/data/extractors/content/test_issue32.json diff --git 
a/tests/data/extractors/test_issue4.html b/tests/data/extractors/content/test_issue4.html similarity index 100% rename from tests/data/extractors/test_issue4.html rename to tests/data/extractors/content/test_issue4.html diff --git a/tests/data/extractors/test_issue4.json b/tests/data/extractors/content/test_issue4.json similarity index 100% rename from tests/data/extractors/test_issue4.json rename to tests/data/extractors/content/test_issue4.json diff --git a/tests/data/extractors/test_lefigaro.html b/tests/data/extractors/content/test_lefigaro.html similarity index 100% rename from tests/data/extractors/test_lefigaro.html rename to tests/data/extractors/content/test_lefigaro.html diff --git a/tests/data/extractors/test_lefigaro.json b/tests/data/extractors/content/test_lefigaro.json similarity index 100% rename from tests/data/extractors/test_lefigaro.json rename to tests/data/extractors/content/test_lefigaro.json diff --git a/tests/data/extractors/test_liberation.html b/tests/data/extractors/content/test_liberation.html similarity index 100% rename from tests/data/extractors/test_liberation.html rename to tests/data/extractors/content/test_liberation.html diff --git a/tests/data/extractors/test_liberation.json b/tests/data/extractors/content/test_liberation.json similarity index 100% rename from tests/data/extractors/test_liberation.json rename to tests/data/extractors/content/test_liberation.json diff --git a/tests/data/extractors/test_links.html b/tests/data/extractors/content/test_links.html similarity index 100% rename from tests/data/extractors/test_links.html rename to tests/data/extractors/content/test_links.html diff --git a/tests/data/extractors/test_links.json b/tests/data/extractors/content/test_links.json similarity index 100% rename from tests/data/extractors/test_links.json rename to tests/data/extractors/content/test_links.json diff --git a/tests/data/extractors/test_marketplace.html b/tests/data/extractors/content/test_marketplace.html similarity 
index 100% rename from tests/data/extractors/test_marketplace.html rename to tests/data/extractors/content/test_marketplace.html diff --git a/tests/data/extractors/test_marketplace.json b/tests/data/extractors/content/test_marketplace.json similarity index 100% rename from tests/data/extractors/test_marketplace.json rename to tests/data/extractors/content/test_marketplace.json diff --git a/tests/data/extractors/test_mashable_issue_74.html b/tests/data/extractors/content/test_mashable_issue_74.html similarity index 100% rename from tests/data/extractors/test_mashable_issue_74.html rename to tests/data/extractors/content/test_mashable_issue_74.html diff --git a/tests/data/extractors/test_mashable_issue_74.json b/tests/data/extractors/content/test_mashable_issue_74.json similarity index 100% rename from tests/data/extractors/test_mashable_issue_74.json rename to tests/data/extractors/content/test_mashable_issue_74.json diff --git a/tests/data/extractors/test_msn1.html b/tests/data/extractors/content/test_msn1.html similarity index 100% rename from tests/data/extractors/test_msn1.html rename to tests/data/extractors/content/test_msn1.html diff --git a/tests/data/extractors/test_msn1.json b/tests/data/extractors/content/test_msn1.json similarity index 100% rename from tests/data/extractors/test_msn1.json rename to tests/data/extractors/content/test_msn1.json diff --git a/tests/data/extractors/test_okaymarketing.html b/tests/data/extractors/content/test_okaymarketing.html similarity index 100% rename from tests/data/extractors/test_okaymarketing.html rename to tests/data/extractors/content/test_okaymarketing.html diff --git a/tests/data/extractors/test_okaymarketing.json b/tests/data/extractors/content/test_okaymarketing.json similarity index 100% rename from tests/data/extractors/test_okaymarketing.json rename to tests/data/extractors/content/test_okaymarketing.json diff --git a/tests/data/extractors/test_opengraph.html 
b/tests/data/extractors/content/test_opengraph.html similarity index 100% rename from tests/data/extractors/test_opengraph.html rename to tests/data/extractors/content/test_opengraph.html diff --git a/tests/data/extractors/test_opengraph.json b/tests/data/extractors/content/test_opengraph.json similarity index 100% rename from tests/data/extractors/test_opengraph.json rename to tests/data/extractors/content/test_opengraph.json diff --git a/tests/data/extractors/test_politico.html b/tests/data/extractors/content/test_politico.html similarity index 100% rename from tests/data/extractors/test_politico.html rename to tests/data/extractors/content/test_politico.html diff --git a/tests/data/extractors/test_politico.json b/tests/data/extractors/content/test_politico.json similarity index 100% rename from tests/data/extractors/test_politico.json rename to tests/data/extractors/content/test_politico.json diff --git a/tests/data/extractors/test_publish_date.html b/tests/data/extractors/content/test_publish_date.html similarity index 100% rename from tests/data/extractors/test_publish_date.html rename to tests/data/extractors/content/test_publish_date.html diff --git a/tests/data/extractors/test_publish_date.json b/tests/data/extractors/content/test_publish_date.json similarity index 100% rename from tests/data/extractors/test_publish_date.json rename to tests/data/extractors/content/test_publish_date.json diff --git a/tests/data/extractors/test_publish_date_article.html b/tests/data/extractors/content/test_publish_date_article.html similarity index 100% rename from tests/data/extractors/test_publish_date_article.html rename to tests/data/extractors/content/test_publish_date_article.html diff --git a/tests/data/extractors/test_publish_date_article.json b/tests/data/extractors/content/test_publish_date_article.json similarity index 100% rename from tests/data/extractors/test_publish_date_article.json rename to tests/data/extractors/content/test_publish_date_article.json diff 
--git a/tests/data/extractors/test_publish_date_rnews.html b/tests/data/extractors/content/test_publish_date_rnews.html similarity index 100% rename from tests/data/extractors/test_publish_date_rnews.html rename to tests/data/extractors/content/test_publish_date_rnews.html diff --git a/tests/data/extractors/test_publish_date_rnews.json b/tests/data/extractors/content/test_publish_date_rnews.json similarity index 100% rename from tests/data/extractors/test_publish_date_rnews.json rename to tests/data/extractors/content/test_publish_date_rnews.json diff --git a/tests/data/extractors/test_publish_date_schema.html b/tests/data/extractors/content/test_publish_date_schema.html similarity index 100% rename from tests/data/extractors/test_publish_date_schema.html rename to tests/data/extractors/content/test_publish_date_schema.html diff --git a/tests/data/extractors/test_publish_date_schema.json b/tests/data/extractors/content/test_publish_date_schema.json similarity index 100% rename from tests/data/extractors/test_publish_date_schema.json rename to tests/data/extractors/content/test_publish_date_schema.json diff --git a/tests/data/extractors/test_tags_abcau.html b/tests/data/extractors/content/test_tags_abcau.html similarity index 100% rename from tests/data/extractors/test_tags_abcau.html rename to tests/data/extractors/content/test_tags_abcau.html diff --git a/tests/data/extractors/test_tags_abcau.json b/tests/data/extractors/content/test_tags_abcau.json similarity index 100% rename from tests/data/extractors/test_tags_abcau.json rename to tests/data/extractors/content/test_tags_abcau.json diff --git a/tests/data/extractors/test_tags_cnet.html b/tests/data/extractors/content/test_tags_cnet.html similarity index 100% rename from tests/data/extractors/test_tags_cnet.html rename to tests/data/extractors/content/test_tags_cnet.html diff --git a/tests/data/extractors/test_tags_cnet.json b/tests/data/extractors/content/test_tags_cnet.json similarity index 100% rename from 
tests/data/extractors/test_tags_cnet.json rename to tests/data/extractors/content/test_tags_cnet.json diff --git a/tests/data/extractors/test_tags_deadline.html b/tests/data/extractors/content/test_tags_deadline.html similarity index 100% rename from tests/data/extractors/test_tags_deadline.html rename to tests/data/extractors/content/test_tags_deadline.html diff --git a/tests/data/extractors/test_tags_deadline.json b/tests/data/extractors/content/test_tags_deadline.json similarity index 100% rename from tests/data/extractors/test_tags_deadline.json rename to tests/data/extractors/content/test_tags_deadline.json diff --git a/tests/data/extractors/test_tags_kexp.html b/tests/data/extractors/content/test_tags_kexp.html similarity index 100% rename from tests/data/extractors/test_tags_kexp.html rename to tests/data/extractors/content/test_tags_kexp.html diff --git a/tests/data/extractors/test_tags_kexp.json b/tests/data/extractors/content/test_tags_kexp.json similarity index 100% rename from tests/data/extractors/test_tags_kexp.json rename to tests/data/extractors/content/test_tags_kexp.json diff --git a/tests/data/extractors/test_tags_wnyc.html b/tests/data/extractors/content/test_tags_wnyc.html similarity index 100% rename from tests/data/extractors/test_tags_wnyc.html rename to tests/data/extractors/content/test_tags_wnyc.html diff --git a/tests/data/extractors/test_tags_wnyc.json b/tests/data/extractors/content/test_tags_wnyc.json similarity index 100% rename from tests/data/extractors/test_tags_wnyc.json rename to tests/data/extractors/content/test_tags_wnyc.json diff --git a/tests/data/extractors/test_techcrunch1.html b/tests/data/extractors/content/test_techcrunch1.html similarity index 100% rename from tests/data/extractors/test_techcrunch1.html rename to tests/data/extractors/content/test_techcrunch1.html diff --git a/tests/data/extractors/test_techcrunch1.json b/tests/data/extractors/content/test_techcrunch1.json similarity index 100% rename from 
tests/data/extractors/test_techcrunch1.json rename to tests/data/extractors/content/test_techcrunch1.json diff --git a/tests/data/extractors/test_testHuffingtonPost.html b/tests/data/extractors/content/test_testHuffingtonPost.html similarity index 100% rename from tests/data/extractors/test_testHuffingtonPost.html rename to tests/data/extractors/content/test_testHuffingtonPost.html diff --git a/tests/data/extractors/test_testHuffingtonPost.json b/tests/data/extractors/content/test_testHuffingtonPost.json similarity index 100% rename from tests/data/extractors/test_testHuffingtonPost.json rename to tests/data/extractors/content/test_testHuffingtonPost.json diff --git a/tests/data/extractors/test_time.html b/tests/data/extractors/content/test_time.html similarity index 100% rename from tests/data/extractors/test_time.html rename to tests/data/extractors/content/test_time.html diff --git a/tests/data/extractors/test_time.json b/tests/data/extractors/content/test_time.json similarity index 100% rename from tests/data/extractors/test_time.json rename to tests/data/extractors/content/test_time.json diff --git a/tests/data/extractors/test_time2.html b/tests/data/extractors/content/test_time2.html similarity index 100% rename from tests/data/extractors/test_time2.html rename to tests/data/extractors/content/test_time2.html diff --git a/tests/data/extractors/test_time2.json b/tests/data/extractors/content/test_time2.json similarity index 100% rename from tests/data/extractors/test_time2.json rename to tests/data/extractors/content/test_time2.json diff --git a/tests/data/extractors/test_title_opengraph.html b/tests/data/extractors/content/test_title_opengraph.html similarity index 100% rename from tests/data/extractors/test_title_opengraph.html rename to tests/data/extractors/content/test_title_opengraph.html diff --git a/tests/data/extractors/test_title_opengraph.json b/tests/data/extractors/content/test_title_opengraph.json similarity index 100% rename from 
tests/data/extractors/test_title_opengraph.json rename to tests/data/extractors/content/test_title_opengraph.json diff --git a/tests/data/extractors/test_tweet.html b/tests/data/extractors/content/test_tweet.html similarity index 100% rename from tests/data/extractors/test_tweet.html rename to tests/data/extractors/content/test_tweet.html diff --git a/tests/data/extractors/test_tweet.json b/tests/data/extractors/content/test_tweet.json similarity index 100% rename from tests/data/extractors/test_tweet.json rename to tests/data/extractors/content/test_tweet.json diff --git a/tests/data/extractors/test_usatoday_issue_74.html b/tests/data/extractors/content/test_usatoday_issue_74.html similarity index 100% rename from tests/data/extractors/test_usatoday_issue_74.html rename to tests/data/extractors/content/test_usatoday_issue_74.html diff --git a/tests/data/extractors/test_usatoday_issue_74.json b/tests/data/extractors/content/test_usatoday_issue_74.json similarity index 100% rename from tests/data/extractors/test_usatoday_issue_74.json rename to tests/data/extractors/content/test_usatoday_issue_74.json diff --git a/tests/data/extractors/test_yahoo.html b/tests/data/extractors/content/test_yahoo.html similarity index 100% rename from tests/data/extractors/test_yahoo.html rename to tests/data/extractors/content/test_yahoo.html diff --git a/tests/data/extractors/test_yahoo.json b/tests/data/extractors/content/test_yahoo.json similarity index 100% rename from tests/data/extractors/test_yahoo.json rename to tests/data/extractors/content/test_yahoo.json diff --git a/tests/data/videos/test_embed.html b/tests/data/extractors/videos/test_embed.html similarity index 100% rename from tests/data/videos/test_embed.html rename to tests/data/extractors/videos/test_embed.html diff --git a/tests/data/videos/test_embed.json b/tests/data/extractors/videos/test_embed.json similarity index 100% rename from tests/data/videos/test_embed.json rename to 
tests/data/extractors/videos/test_embed.json diff --git a/tests/data/videos/test_iframe.html b/tests/data/extractors/videos/test_iframe.html similarity index 100% rename from tests/data/videos/test_iframe.html rename to tests/data/extractors/videos/test_iframe.html diff --git a/tests/data/videos/test_iframe.json b/tests/data/extractors/videos/test_iframe.json similarity index 100% rename from tests/data/videos/test_iframe.json rename to tests/data/extractors/videos/test_iframe.json diff --git a/tests/data/videos/test_object.html b/tests/data/extractors/videos/test_object.html similarity index 100% rename from tests/data/videos/test_object.html rename to tests/data/extractors/videos/test_object.html diff --git a/tests/data/videos/test_object.json b/tests/data/extractors/videos/test_object.json similarity index 100% rename from tests/data/videos/test_object.json rename to tests/data/extractors/videos/test_object.json diff --git a/tests/extractors/__init__.py b/tests/extractors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/extractors/base.py b/tests/extractors/base.py new file mode 100644 index 00000000..60990b77 --- /dev/null +++ b/tests/extractors/base.py @@ -0,0 +1,252 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" originally licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# Response
class MockResponse(object):
    """\
    Base mock response class

    Wraps a canned body in a urllib2 response object so that code
    issuing HTTP requests receives it instead of real network data.
    Subclasses override content() to choose the body.
    """
    # status values copied onto every response built by response()
    code = 200
    msg = "OK"

    def __init__(self, cls):
        # object handed over by MockHTTPHandler.http_open()
        self.cls = cls

    def content(self, req=None):
        """\
        Return the body served for `req`.

        `req` is ignored here. It is accepted because response()
        always passes the request along; with the former zero-argument
        signature that call raised a TypeError.
        """
        return "response"

    def response(self, req):
        """\
        Build the response object answering `req`.

        `req` must provide get_full_url().
        """
        data = self.content(req)
        url = req.get_full_url()
        # NOTE(review): the body string is also passed in the headers
        # slot of addinfourl - kept as is, confirm nothing reads info()
        resp = urllib2.addinfourl(StringIO(data), data, url)
        resp.code = self.code
        resp.msg = self.msg
        return resp


class MockHTTPHandler(urllib2.HTTPHandler, urllib2.HTTPSHandler):
    """\
    Mocked HTTPHandler in order to query APIs locally

    Both http and https requests are answered by the `callback`
    attribute of the object registered through patch().
    """
    # set by patch(); must expose a `callback` attribute
    cls = None

    def https_open(self, req):
        # https is served exactly like http
        return self.http_open(req)

    def http_open(self, req):
        # instantiate the callback with its owner, let it build the reply
        mock = self.cls.callback(self.cls)
        return mock.response(req)

    @staticmethod
    def patch(cls):
        """\
        Install a global opener using this handler and bind `cls` to it.

        Returns the first MockHTTPHandler instance found in the opener.
        """
        opener = urllib2.build_opener(MockHTTPHandler)
        urllib2.install_opener(opener)
        # build_opener() instantiates the handler itself, so the
        # instances have to be looked up again to attach `cls`
        mocked = [h for h in opener.handlers
                  if isinstance(h, MockHTTPHandler)]
        for handler in mocked:
            handler.cls = cls
        return mocked[0]

    @staticmethod
    def unpatch():
        """\
        Drop the globally installed opener.
        """
        # urllib2
        urllib2._opener = None


class BaseMockTests(unittest.TestCase):
    """\
    Base Mock test case

    Replaces DNS resolution and the urllib2 opener for the duration of
    each test, then restores them.
    """
    # class instantiated by MockHTTPHandler to produce responses
    callback = MockResponse

    def setUp(self):
        # patch DNS
        self.original_getaddrinfo = socket.getaddrinfo
        socket.getaddrinfo = self.new_getaddrinfo
        MockHTTPHandler.patch(self)

    def tearDown(self):
        MockHTTPHandler.unpatch()
        # DNS
        socket.getaddrinfo = self.original_getaddrinfo

    def new_getaddrinfo(self, *args, **kwargs):
        """\
        Stand-in for socket.getaddrinfo resolving everything to 127.0.0.1

        Keyword arguments are accepted as well so callers passing
        family/type/proto by name do not fail.
        """
        return [(2, 1, 6, '', ('127.0.0.1', 0))]

    def _get_current_testname(self):
        # last dotted component of the test id, i.e. the method name
        return self.id().rsplit('.', 1)[-1]


class MockResponseExtractors(MockResponse):
    """\
    Mock response serving the html fixture named after the running test
    """

    def content(self, req=None):
        """\
        Return the content of data/extractors/content/<test name>.html

        The path is resolved from the parent directory of CURRENT_PATH;
        `req` is ignored.
        """
        current_test = self.cls._get_current_testname()
        path = os.path.join(
                os.path.dirname(CURRENT_PATH),
                "data",
                "extractors",
                "content",
                "%s.html" % current_test)
        path = os.path.abspath(path)
        return FileHelper.loadResourceFile(path)
ARTICLE REPORT %s .::======================\n" % self.id() + # print 'expected_value (%s) \n' % len(expected_value) + # print expected_value + # print "-------" + # print 'result_value (%s) \n' % len(result_value) + # print result_value + + # cleaned_text is Null + msg = u"Resulting article text was NULL!" + self.assertNotEqual(result_value, None, msg=msg) + + # cleaned_text length + msg = u"Article text was not as long as expected beginning!" + self.assertTrue(len(expected_value) <= len(result_value), msg=msg) + + # clean_text value + result_value = result_value[0:len(expected_value)] + msg = u"The beginning of the article text was not as expected!" + self.assertEqual(expected_value, result_value, msg=msg) + + def assert_tags(self, field, expected_value, result_value): + """\ + + """ + # as we have a set in expected_value and a list in result_value + # make result_value a set + expected_value = set(expected_value) + + # check if both have the same number of items + msg = (u"expected tags set and result tags set" + u"don't have the same number of items") + self.assertEqual(len(result_value), len(expected_value), msg=msg) + + # check if each tag in result_value is in expected_value + for tag in result_value: + self.assertTrue(tag in expected_value) + + def runArticleAssertions(self, article, fields): + """\ + + """ + for field in fields: + expected_value = self.data['expected'][field] + result_value = getattr(article, field, None) + + # custom assertion for a given field + assertion = 'assert_%s' % field + if hasattr(self, assertion): + getattr(self, assertion)(field, expected_value, result_value) + continue + + # default assertion + msg = u"Error %s \nexpected: %s\nresult: %s" % (field, expected_value, result_value) + self.assertEqual(expected_value, result_value, msg=msg) + + def extract(self, instance): + article = instance.extract(url=self.data['url']) + return article + + def getConfig(self): + config = Configuration() + config.enable_image_fetching = False + 
return config + + def getArticle(self): + """\ + + """ + # load test case data + self.loadData() + + # basic configuration + # no image fetching + config = self.getConfig() + self.parser = config.get_parser() + + # target language + # needed for non english language most of the time + target_language = self.data.get('target_language') + if target_language: + config.target_language = target_language + config.use_meta_language = False + + # run goose + g = Goose(config=config) + return self.extract(g) diff --git a/tests/extractors.py b/tests/extractors/content.py similarity index 100% rename from tests/extractors.py rename to tests/extractors/content.py diff --git a/tests/images.py b/tests/extractors/images.py similarity index 100% rename from tests/images.py rename to tests/extractors/images.py diff --git a/tests/videos.py b/tests/extractors/videos.py similarity index 100% rename from tests/videos.py rename to tests/extractors/videos.py