-
Notifications
You must be signed in to change notification settings - Fork 1
/
extract_guideline_links.php
54 lines (35 loc) · 1.11 KB
/
extract_guideline_links.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
<?php
require_once('go_wayback.function.php');
// Step 1: collect every search-result title link from the mirrored HTML pages.
//
// Work inside the mirror directory so glob() can use relative file names.
// Fail fast when the directory is missing: otherwise the script would scan
// whatever directory it was started from and later write its output one
// level above that.
if (!chdir('./www.guideline.gov')) {
    throw new RuntimeException('Unable to change directory to ./www.guideline.gov');
}

// One entry per link found: ['anchor' => link text, 'href' => link target].
$data = [];

// The pages are messy HTML. Collect libxml parse errors internally instead of
// muting the whole call with "@", which would also hide unrelated problems.
$previous_libxml_setting = libxml_use_internal_errors(true);

// glob() returns false on error; treat that the same as "no files".
foreach (glob('*.html') ?: [] as $file_name) {
    $html = file_get_contents($file_name);
    if ($html === false || $html === '') {
        // Unreadable or empty file: nothing to extract. An empty string must
        // not reach loadHTML(), which rejects it (ValueError on PHP 8).
        continue;
    }

    $doc = new DOMDocument();
    $doc->loadHTML($html);
    // Drop the buffered parse errors so they do not pile up across files.
    libxml_clear_errors();

    $xpath = new DOMXPath($doc);
    // Anchors inside the <h3 class="results-list-item-title"> headings.
    $xpath_query = '//h3[@class="results-list-item-title"]/a';
    foreach ($xpath->query($xpath_query) as $result_title_link) {
        $data[] = [
            'anchor' => $result_title_link->nodeValue,
            'href' => $result_title_link->getAttribute('href'),
        ];
    }
}

// Restore whatever libxml error handling was active before this step.
libxml_use_internal_errors($previous_libxml_setting);
// Step 2: pass every collected link to go_wayback() and record what it
// reports back on the same row. The second argument '.' is presumably the
// destination directory (the current working directory) - confirm against
// go_wayback.function.php.
foreach (array_keys($data) as $index) {
    $mirror = go_wayback($data[$index]['href'], '.');
    // Keep this insertion order: the rows are later written to CSV as-is.
    $data[$index]['mirror_file'] = $mirror['saved_to_file'];
    $data[$index]['timestamp'] = $mirror['timestamp'];
}
// Step 3: write the collected rows to guidelines_links.csv in the main directory.
//
// Go back up to the main directory; stop if that fails so the CSV is never
// written to an unexpected location.
if (!chdir('..')) {
    throw new RuntimeException('Unable to change back to the parent directory');
}

$fp = fopen('guidelines_links.csv', 'w');
if ($fp === false) {
    // Without this check every fputcsv() below would be called on false.
    throw new RuntimeException('Unable to open guidelines_links.csv for writing');
}

// Header row. fputcsv() writes array values in insertion order, and each row
// is built as anchor, href, mirror_file, timestamp - matching these columns.
fputcsv($fp, ['guideline anchor', 'guideline url', 'mirror_file', 'timestamp']);
foreach ($data as $row) {
    fputcsv($fp, $row);
}

// Close explicitly so the data is flushed and the handle released right away.
fclose($fp);
// Text for the guidelines document: the title on its own line, with a newline
// before and after it. Not referenced anywhere else in the visible script.
$Guidelines_MD_text = "\nAHRQ Guidelines\n";