forked from CareSet/AHRQ_search_clone
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_expert_commentary.php
81 lines (60 loc) · 2.37 KB
/
get_expert_commentary.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
<?php
// Name of the directory (relative to the working directory) that the
// download phase further down changes into before fetching anything.
$dir = "expert_commentary";

// Parse the following HTML files, looking for anchors whose href contains
// the paired text; every matching link is queued in $data for download.
// Map of: local HTML file to scan => substring an href must contain.
$worklist = [
    './one_off_mirror/www.guideline.gov/expert.html' => 'expert-commentary',
    './one_off_mirror/www.guideline.gov/syntheses/index.html' => 'synthesis',
    './one_off_mirror/www.qualitymeasures.ahrq.gov/expert.html' => 'expert-commentary',
];

// List of ['anchor' => link text, 'href' => link target] rows.
$data = [];

// The mirrored HTML is messy, so have libxml collect parse errors internally
// rather than printing them. The previous setting is restored after the loop.
$previous_libxml_setting = libxml_use_internal_errors(true);

foreach ($worklist as $source_file => $filter) {
    // An unreadable or empty file cannot be parsed. Handing false or '' to
    // loadHTML() raises a ValueError on PHP 8, so skip such files explicitly.
    $html_text = is_readable($source_file) ? file_get_contents($source_file) : false;
    if ($html_text === false || trim($html_text) === '') {
        echo "Skipping $source_file: file is missing, unreadable or empty\n";
        continue;
    }

    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($html_text);
    libxml_clear_errors(); // discard the collected parse errors for this file
    if ($loaded === false) {
        echo "Skipping $source_file: could not be parsed as HTML\n";
        continue;
    }

    $xpath = new DOMXPath($doc);
    $xpath_query = '//a'; // every anchor element in the document

    foreach ($xpath->query($xpath_query) as $this_link) {
        $anchor = $this_link->nodeValue;
        $href = $this_link->getAttribute('href');

        // Keep only links whose target contains this file's filter text.
        if (strpos($href, $filter) !== false) {
            $data[] = [
                'anchor' => $anchor,
                'href' => $href,
            ];
        }
    }
}

libxml_use_internal_errors($previous_libxml_setting);
// Download phase: for every collected link, fetch the page with wget unless a
// copy already exists under ./<host>/<url directory>/<basename>.html inside
// the output directory.

// All paths below are relative to the output directory. If it cannot be
// entered, stop rather than scatter downloads into the wrong place.
if (!chdir("./$dir")) {
    echo "Could not change into ./$dir - aborting\n";
    exit(1);
}

foreach ($data as $row) {
    $href = $row['href'];

    // Only absolute URLs with a host and a path map onto a local file name;
    // relative or malformed links are skipped instead of producing
    // undefined-index notices and a bogus output path.
    $parsed = parse_url($href);
    if (
        $parsed === false
        || !isset($parsed['host'], $parsed['path'])
        || $parsed['host'] === ''
        || $parsed['path'] === ''
    ) {
        echo "Skipping $href (not an absolute URL with a path)\n";
        continue;
    }

    $domain_name = $parsed['host'];
    $pathinfo = pathinfo($parsed['path']);
    $dirname = isset($pathinfo['dirname']) ? $pathinfo['dirname'] : '';
    $basename = $pathinfo['basename'];
    $out_file_name = substr($basename, 0, 245); // file names can be too long..

    $target_dir = "./$domain_name/$dirname";
    $file_to_test = "$target_dir/$out_file_name.html";
    echo "Looking for $file_to_test\n";

    if (file_exists($file_to_test)) {
        echo "Got file $file_to_test... moving right along...\n";
        continue;
    }

    echo "Missing file\n";

    // wget -O does not create missing parent directories, so make them first.
    if (!is_dir($target_dir) && !mkdir($target_dir, 0777, true) && !is_dir($target_dir)) {
        echo "Could not create directory $target_dir - skipping $href\n";
        continue;
    }

    // The URL comes from scraped HTML and may contain shell metacharacters
    // (&, ?, ;, spaces), so both it and the output path are escaped.
    // NOTE(review): wget's manual says -O writes everything it downloads into
    // the one named file and does not combine well with --timestamping;
    // the flags are kept exactly as they were - confirm this is intended.
    $cmd = 'wget -O ' . escapeshellarg($file_to_test)
        . ' --span-hosts --backup-converted --timestamping --page-requisites '
        . escapeshellarg($href);
    system($cmd);

    echo "
###########################################################################################################
###########################################################################################################
FINISHED $href
###########################################################################################################
###########################################################################################################
\n\n";
}