package spider;

import java.net.*;
import java.io.*;
import java.util.*;
import javax.swing.text.html.parser.ParserDelegator;

import data.*;

/**
 * Web spider: fetches pages over HTTP, parses them with an {@link Hparser},
 * records each new page through <code>Index.db</code> and feeds the links it
 * finds back into the shared {@link Queue}.
 */
public class Spider{
	/** Queue of links waiting to be crawled; replaced whenever a Spider is constructed. */
	static Queue queue;
	/** Cache refreshed every <code>Index.REFRESH</code> indexed pages; replaced whenever a Spider is constructed. */
	static Cache cache;
	int qsize;
	/** Parser callback that receives the content of the page currently being scanned. */
	Hparser hparse; 
	/** URL object of the page most recently passed to {@link #scanPage(String)}. */
	URL currenturl;

	/**
	 * Creates a spider with a fresh queue, cache and parser.
	 * Note that the queue and cache are static, so constructing a second
	 * Spider replaces the ones used by the first.
	 */
	public Spider() {
		queue = new Queue();
		cache = new Cache(); 
		hparse = new Hparser();
	}

	/**
	 * Registers a start URL for the crawl.
	 *
	 * @param urlstring the URL to seed the crawl with
	 * @return "" for an empty input, "Already Scanned" if the index already
	 *         has a record for it, "Bad URL" if the page could not be scanned,
	 *         otherwise "URL Added" or "Already in Queue" depending on whether
	 *         the queue accepted the link
	 */
	public String addSeed(String urlstring){
		if(urlstring.equals("")) return "";
		if(Index.db.checkRecord(urlstring, "", 0)) return "Already Scanned";
		String scanres = scanPage(urlstring);
		if(scanres.equals("bad")) return "Bad URL"; 
		String domain = seedDomain(urlstring);
		// contains() is already false for an empty collection, so one check is enough.
		if(!Index.domains.contains(domain)) Index.domains.add(domain);	
		if(queue.addLink("",urlstring,"")){
			return "URL Added";
		} else return "Already in Queue";
	}

	/**
	 * Returns the seed URL up to (not including) its last path separator.
	 * When the URL has no path at all (e.g. "http://host.com") the last '/'
	 * belongs to "://", so the whole URL is returned instead of the
	 * meaningless prefix "http:/"; the same applies when there is no '/'.
	 */
	private static String seedDomain(String urlstring){
		int lastslash = urlstring.lastIndexOf('/');
		int scheme = urlstring.indexOf("://");
		boolean slashInScheme = scheme >= 0 && lastslash < scheme + 3;
		if(lastslash < 0 || slashInScheme) return urlstring;
		return urlstring.substring(0, lastslash);
	}

	/**
	 * Takes the next entry from the queue and, if the index has no record of
	 * it yet, scans the page, queues its links and adds it to the index.
	 *
	 * @return the queue size after this step
	 */
	public int Crawl(){
		String urlstring = new String(queue.getNextURL());
		int currentinlinks = queue.getNextInlink();
		String currentlinktext = new String(queue.getNextLinktext());
		hparse = new Hparser(); // fresh parser state for every page
		System.out.println("\nLookup : "+urlstring);
		if(!Index.db.checkRecord(urlstring, currentlinktext,1)){
			System.out.println("New page = "+urlstring);
			urlstring = scanPage(urlstring);
			if(!urlstring.equals("bad")){
				// Base for the page's links:
				//  http://www.host.com/THIS/PART/filename.htm
				// getPath() is used rather than getFile() so that a '/' inside
				// the query string cannot be mistaken for a directory separator.
				String URLbase = currenturl.getProtocol()+"://"+currenturl.getHost()
					+directoryOf(currenturl.getPath())+"/";
				// The parser's link list is consumed in pairs, in the same
				// order as before; a trailing unpaired element is ignored
				// instead of raising NoSuchElementException.
				Vector links = hparse.getLinks();
				int outlinks = links.size()/2;
				for(int i = 0; i + 1 < links.size(); i += 2){
					queue.addLink(URLbase, (String)links.elementAt(i), (String)links.elementAt(i+1));
				}
				Index.db.addSite(urlstring, hparse.getTitle(), hparse.getDescription(), hparse.getKeywords(), hparse.getPagetext(), new Integer(currentinlinks), currentlinktext, outlinks);
				System.out.println("Added : "+hparse.getTitle());
				Index.sitecount++;
				System.out.println("Total Pages = "+Index.sitecount);
				if(Index.sitecount%Index.REFRESH == 0){
					cache.refresh();
					System.gc(); // Garbage Collection
				} 
			}
		}
		return queue.getSize();
	}

	/**
	 * Returns the directory part of a URL path: everything before the last
	 * '/' or '\'. Returns "" when the path has no separator (for example the
	 * empty path of "http://host.com"), where substring(0, -1) used to throw.
	 */
	private static String directoryOf(String path){
		int dirend = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
		return dirend < 0 ? "" : path.substring(0, dirend);
	}

	/**
	 * Tells whether a Content-Type header value denotes HTML. Accepts
	 * parameters after the media type ("text/html; charset=UTF-8"), ignores
	 * case, and treats a missing header (null) as not HTML.
	 */
	private static boolean isHtml(String contenttype){
		if(contenttype == null) return false;
		String html = "text/html";
		return contenttype.trim().regionMatches(true, 0, html, 0, html.length());
	}

	/**
	 * Fetches a page and feeds it to the current parser.
	 * Sets {@link #currenturl} as a side effect. Only the "http" protocol,
	 * a 200 response and an HTML content type are accepted.
	 *
	 * @param urlstring the URL to fetch
	 * @return the string form of the parsed URL on success, or "bad" for any
	 *         failure (malformed URL, other protocol, non-200 response,
	 *         non-HTML content or I/O error)
	 */
	public String scanPage(String urlstring){
		String status = "good";
		HttpURLConnection httpconn = null;
		try{
			currenturl = new URL(urlstring);
			// Continue with the URL object's own string form (the original
			// code relied on this to collapse /../ segments).
			urlstring = currenturl.toString();
			if(!currenturl.getProtocol().equals("http")){
				status = currenturl.getProtocol()+" protocol";
			} else {
				httpconn = (HttpURLConnection)currenturl.openConnection();
				if(httpconn.getResponseCode() != HttpURLConnection.HTTP_OK){
					status = "bad";
				} else if(!isHtml(httpconn.getContentType())){
					status = "Not text/html";
				} else {
					System.out.println("OK, parsing...");
					BufferedReader in = new BufferedReader(new InputStreamReader(httpconn.getInputStream()));
					try{
						new ParserDelegator().parse(in, hparse, true);
					} finally {
						in.close(); // also closes the wrapped stream reader
					}
				}
			}
		} catch(IOException ioe){
			// MalformedURLException and UnknownHostException are IOExceptions
			// too; every one of them marks the URL as bad.
			status = ioe.toString();
		} finally {
			// Release the connection on every path, not only after a successful parse.
			if(httpconn != null) httpconn.disconnect();
		}
		if(status.equals("good")){
			return urlstring;
		}
		System.out.println("Bad URL = "+urlstring);
		return "bad";
	}
}
