
Amazon Scraper
GitHub
Languages & Tools: Java, Jsoup
Two programs: Given an Amazon category or search URL, the first program saves each Amazon product link to a text file.
The second program parses the information from each product link and writes it to a CSV spreadsheet file.
Sample Code
/**
 * Reads the product price from the element with id {@code priceblock_ourprice}
 * and stores the normalized text in {@code price}.
 *
 * <p>Normalization removes every {@code "$ "} sequence, turns each remaining
 * space into a {@code "."}, and finally removes any remaining {@code "$"}; for
 * example {@code "$ 12 99"} becomes {@code "12.99"}.
 *
 * <p>When the document or the price element is absent, {@code price} is left
 * unchanged.
 */
public void getPrice() {
    if (document == null) {
        return;
    }
    Element priceElement = document.getElementById("priceblock_ourprice");
    if (priceElement == null) {
        // No price block on this page: keep whatever value price already holds.
        return;
    }
    price = priceElement.text()
            .replace("$ ", "")
            .replace(" ", ".")
            .replace("$", "");
}
public void getReviews(){
try{
Element custReviews = document.getElementById("acrCustomerReviewLink");
String customer = custReviews.text();
customerReviews = customer;
}catch(Exception e){
}
try{
Elements reviews = document.getElementsByClass("a-row review-data");
String words5 = reviews.first().html();
String temp5 = words5.replace("
", "$$$");
Document doc5 = Jsoup.parse(temp5);
String desc5 = doc5.body().text().replace("$$$", "n").replace("Read more", "").toString();
review = desc5;
}catch(Exception e){
}
}
/**
 * Builds {@code postCategory} from the breadcrumb trail in the element with id
 * {@code wayfinding-breadcrumbs_container}.
 *
 * <p>Each {@code " › "} separator in the breadcrumb text is replaced with
 * {@code "|"}, and {@code getBy} is appended after a final {@code "|"}. When
 * the document or the breadcrumb element is absent, {@code postCategory} is
 * set to {@code getBy} alone.
 */
public void getCategories() {
    Element breadcrumbs = (document == null)
            ? null
            : document.getElementById("wayfinding-breadcrumbs_container");
    if (breadcrumbs == null) {
        // No breadcrumb trail available: fall back to the bare getBy value.
        postCategory = getBy;
        return;
    }
    // Literal replacement; the separator contains no regex metacharacters.
    String trail = breadcrumbs.text().replace(" › ", "|");
    postCategory = trail + "|" + getBy;
}
/**
 * Chooses the CSS selector stored in {@code imageString}.
 *
 * <p>{@code "div#leftCol img[src]"} is used when it matches at least one
 * element in the document; otherwise {@code "div#altImages img[src]"} is used.
 * The fallback is also used when the document is absent.
 */
public void getImageString() {
    final String primarySelector = "div#leftCol img[src]";
    final String fallbackSelector = "div#altImages img[src]";

    // select() yields an empty result (it does not throw) when nothing matches,
    // so the fallback has to be chosen by inspecting the result explicitly.
    boolean primaryMatches = document != null && !document.select(primarySelector).isEmpty();
    imageString = primaryMatches ? primarySelector : fallbackSelector;
}
/**
 * Stores the {@code src} attribute obtained from the elements selected by
 * {@code imageString} in {@code featuredImage}.
 *
 * <p>If that value contains {@code "G/01/"} or does not start with
 * {@code "http"}, a message is printed and both {@code skip} and
 * {@code noImg} are set to {@code true}. Any exception raised along the way is
 * reported with a printed message and otherwise ignored.
 */
public void getProductImages() {
    try {
        final String src = document.select(imageString).attr("src");
        featuredImage = src;

        final boolean usable = src.startsWith("http") && !src.contains("G/01/");
        if (!usable) {
            System.out.println("There is no featured image... Skipped....");
            skip = true;
            noImg = true;
        }
    } catch (Exception e) {
        System.out.println("getProduct Images failed");
    }
}