-
Notifications
You must be signed in to change notification settings - Fork 1
/
Search.java
248 lines (227 loc) · 7.87 KB
/
Search.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import java.net.*;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
/**
 * Scrapes Barnes &amp; Noble search-result and product pages with regular
 * expressions and hands the extracted book fields to a {@link BookStorage}.
 *
 * <p>Typical flow for a plain title search: call
 * {@link #titleSearch(String, BufferedWriter)} once per title to queue the
 * product link of the first matching result, then call
 * {@link #parseIndividualItems()} to scrape and store every queued product.
 *
 * <p>I/O failures are reported with {@code printStackTrace()} rather than
 * propagated, except in {@link #getinfo(String, String)} which throws.
 */
public class Search {
    /** Base address of the search page; the search term is appended verbatim. */
    private static final String SEARCH_URL = "https://www.barnesandnoble.com/s/";
    /** Site root; the product link captured from a result list is appended to it. */
    private static final String ITEM_URL = "https://www.barnesandnoble.com/";
    /** Image returned by {@link #featuredItem()}. */
    private static final String FEATURED_IMAGE_URL =
            "https://prodimage.images-bn.com/pimages/9780785168508_p0_v2_s550x406.jpg";

    // Every pattern below exposes the wanted value as capture group 2
    // (group 1 is the whole expression). Compiled once instead of per call.
    private static final Pattern ISBN_PATTERN =
            Pattern.compile("(\\.setTargeting\\('sku', '(.*?)')");
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("(\\.setTargeting\\('title', '(.*?)')");
    private static final Pattern AUTHOR_PATTERN =
            Pattern.compile("(\\.setTargeting\\('author', '(.*?)')");
    private static final Pattern GENRE_PATTERN =
            Pattern.compile("(\\.setTargeting\\('cat1', '(.*?)')");
    private static final Pattern PRICE_PATTERN =
            Pattern.compile("(Current price is (.*?),)");
    /** Picks the product link of the first entry in a search-result page. */
    private static final Pattern PRIMARY_PATTERN =
            Pattern.compile("(<a class=\"pImageLink \".*?href=\"(.*?);.*?Author?)");

    /**
     * Product links queued by {@link #titleSearch}. The list is never cleared,
     * so a second {@link #parseIndividualItems()} call re-processes old entries.
     */
    private final ArrayList<String> allTitleHrefResults = new ArrayList<String>();
    private final BookStorage bookStorage;

    /**
     * @param bookStorage destination for every scraped book; its
     *                    {@code storeData} method is called once per scraped product page
     */
    public Search(BookStorage bookStorage) {
        this.bookStorage = bookStorage;
    }

    /**
     * Downloads a page and returns its text with all lines joined together
     * (line terminators are dropped, nothing is inserted in their place).
     *
     * @param itemUrl      address of the page to download
     * @param titleUrlFile unused; kept so existing callers keep compiling
     * @return the concatenated lines of the page
     * @throws IOException if the address is malformed or the page cannot be read
     */
    public String getinfo(String itemUrl, String titleUrlFile) throws IOException {
        StringBuilder page = new StringBuilder();
        appendPage(itemUrl, page);
        return page.toString();
    }

    /**
     * Searches for {@code title}, logs the search address to {@code outFile}
     * and queues the product link of the first result for
     * {@link #parseIndividualItems()}.
     *
     * @param title   search term, appended to the search address as-is
     * @param outFile receives one line naming the address that was searched
     */
    public void titleSearch(String title, BufferedWriter outFile) {
        String titleUrl = SEARCH_URL + title;
        try {
            outFile.write("Searched Barnes & Noble Url: " + titleUrl + "\n");
        } catch (IOException e) {
            // A failed log line must not prevent the search itself.
            e.printStackTrace();
        }
        try {
            parseHtml(getinfo(titleUrl, title + ".txt"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Scrapes and stores every product queued by earlier title searches. */
    public void parseIndividualItems() {
        for (String href : allTitleHrefResults) {
            getBookInfo(href);
        }
    }

    /**
     * Advanced search: looks for the result of a title search whose link
     * carries the given ISBN and, if present, scrapes and stores that book.
     *
     * @param title search term, appended to the search address as-is
     * @param isbn  ISBN expected in the {@code ean=} part of the result link
     * @return {@code 0} when a matching result was found and handed to storage,
     *         {@code 1} when no result matched or the search page could not be read
     */
    public int advancedBookSearch(String title, String isbn) {
        try {
            String resultsHtml = getinfo(SEARCH_URL + title, title + ".txt");
            return getAdvancedBookInfo(resultsHtml, isbn);
        } catch (IOException e) {
            e.printStackTrace();
            // Nothing was found or stored, so do not report success.
            return 1;
        }
    }

    /**
     * Downloads the featured item's cover image.
     *
     * @return the decoded image, or {@code null} if it could not be
     *         downloaded or decoded
     */
    public BufferedImage featuredItem() {
        try {
            // MalformedURLException is an IOException, so one handler covers both steps.
            return ImageIO.read(new URL(FEATURED_IMAGE_URL));
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Dumps the content of every address ending in {@code html}, {@code htm}
     * or {@code txt} into {@code output.txt}; other addresses are skipped.
     *
     * @param urls addresses to inspect
     */
    public void getUrlInfo(ArrayList<String> urls) {
        try (BufferedWriter out = new BufferedWriter(new FileWriter("output.txt"))) {
            for (String candidate : urls) {
                String[] parts = candidate.split("\\.");
                String extension = parts[parts.length - 1];
                if (extension.equals("html") || extension.equals("htm") || extension.equals("txt")) {
                    readHtml(candidate, out);
                }
            }
        } catch (IOException e) {
            // Covers both failing to open the output file and failing to close it.
            e.printStackTrace();
        }
    }

    /** Appends every line of the page at {@code address} to {@code sink}, without separators. */
    private static void appendPage(String address, StringBuilder sink) throws IOException {
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new URL(address).openStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sink.append(line);
            }
        }
    }

    /** Queues the first product link found in a search-result page, if there is one. */
    private void parseHtml(String htmlToParse) {
        String href = patternMatcher(htmlToParse, PRIMARY_PATTERN);
        // An empty link would later resolve to the site root and store a blank book.
        if (!href.isEmpty()) {
            allTitleHrefResults.add(href);
        }
    }

    /** Returns capture group 2 of the first match, or an empty string when nothing matches. */
    private static String patternMatcher(String parse, Pattern pattern) {
        Matcher matcher = pattern.matcher(parse);
        return matcher.find() ? matcher.group(2) : "";
    }

    /**
     * Downloads one product page and stores the fields extracted from it.
     * A read failure is logged and whatever was downloaded up to that point
     * is still parsed, so fields that cannot be found are stored as empty strings.
     */
    private void getBookInfo(String bookUrl) {
        StringBuilder downloaded = new StringBuilder();
        try {
            appendPage(ITEM_URL + bookUrl, downloaded);
        } catch (IOException e) {
            e.printStackTrace();
        }
        String bookHtml = downloaded.toString();
        bookStorage.storeData(
                patternMatcher(bookHtml, ISBN_PATTERN),
                patternMatcher(bookHtml, TITLE_PATTERN),
                patternMatcher(bookHtml, AUTHOR_PATTERN),
                patternMatcher(bookHtml, GENRE_PATTERN),
                patternMatcher(bookHtml, PRICE_PATTERN));
    }

    /** Returns {@code 0} after scraping the result matching {@code isbnAS}, or {@code 1} if none matches. */
    private int getAdvancedBookInfo(String itemHtml, String isbnAS) {
        String href = findCorrectBook(itemHtml, isbnAS);
        if (href == null) {
            return 1;
        }
        getBookInfo(href);
        return 0;
    }

    /**
     * Finds the product link of the search result whose {@code ean=} value is
     * {@code isbn}.
     *
     * @return the link, or {@code null} when no result carries that ISBN
     */
    private static String findCorrectBook(String bookHtml, String isbn) {
        // The ISBN is user input: quote it so it is matched literally and
        // cannot alter or break the expression.
        Pattern pattern = Pattern.compile(
                "(<a class=\"pImageLink \".*?href=\"(.*?);.*?ean=("
                        + Pattern.quote(String.valueOf(isbn))
                        + ")\">.*?Author?)");
        Matcher matcher = pattern.matcher(bookHtml);
        return matcher.find() ? matcher.group(2) : null;
    }

    /**
     * Copies one page into {@code out}: the address, each line of the page,
     * the number of lines, then three blank lines.
     */
    private static void readHtml(String url, BufferedWriter out) {
        int numOfLines = 0;
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new URL(url).openStream()))) {
            // Read before writing the header so an unreadable page leaves no entry behind.
            String line = reader.readLine();
            out.write(url + "\n");
            while (line != null) {
                out.write(line + "\n");
                numOfLines++;
                line = reader.readLine();
            }
            out.write(numOfLines + "\n");
            out.write("\n\n\n");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
} // end Search class