package spider;

import javax.swing.text.html.HTMLEditorKit.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import java.util.*;
import java.text.*;

/**
 * Parser callback that collects the interesting parts of an HTML page while
 * it is being parsed: the title, the META description and keywords, the plain
 * page text and the hyperlinks.
 *
 * Links are stored in {@link #links} as a flat sequence of pairs:
 * href, link text, href, link text, ...
 */
public class Hparser extends ParserCallback{

	/** Longest title, description or keyword string handed out by the getters. */
	private static final int MAX_FIELD_LENGTH = 100;
	/** Longest page-text excerpt handed out by {@link #getPagetext()}. */
	private static final int MAX_PAGETEXT_LENGTH = 200;

	String title, description, keywords, pagetext, linktext;
	Vector<String> links;
	/** What the text currently being parsed belongs to: NONE, TITLE or HREF. */
	char state;
	static final char NONE = 0;
	static final char TITLE = 1;
	static final char HREF = 2;

	/**
	 * Creates a parser callback with empty strings and an empty link list.
	 */
	public Hparser(){
		title = "";
		description = "";
		keywords = "";
		pagetext = "";
		linktext = "";
		links = new Vector<String>();
	}

	/**
	 * Handles standalone tags; only META tags are of interest.
	 *
	 * @param tag     the tag that was encountered
	 * @param attribs the attributes of the tag
	 * @param pos     position in the document (unused)
	 */
	@Override
	public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attribs, int pos){
		if(tag.equals(HTML.Tag.META)){
			handleMeta(attribs);
		}
	}

	/**
	 * Handles opening tags: TITLE switches to title collection, A is passed
	 * on to {@link #handleAnchor(MutableAttributeSet)}.
	 *
	 * @param tag     the tag that was opened
	 * @param attribs the attributes of the tag
	 * @param pos     position in the document (unused)
	 */
	@Override
	public void handleStartTag(HTML.Tag tag, MutableAttributeSet attribs, int pos){
		if(tag.equals(HTML.Tag.TITLE)){
			state = TITLE;
		}
		if(tag.equals(HTML.Tag.A)){
			handleAnchor(attribs);
		}
	}

	/**
	 * Handles closing tags. Closing TITLE ends title collection; closing A
	 * stores the collected link text right after its href and ends link
	 * collection. Any other closing tag is ignored.
	 *
	 * @param tag the tag that was closed
	 * @param pos position in the document (unused)
	 */
	@Override
	public void handleEndTag(HTML.Tag tag, int pos){
		if(state == TITLE && tag.equals(HTML.Tag.TITLE)){
			state = NONE;
		} else if(state == HREF && tag.equals(HTML.Tag.A)){
			links.add(linktext);
			state = NONE;
		}
	}

	/**
	 * Handles a run of text. Depending on the current state the text is
	 * appended to the page text (followed by a space), or replaces the
	 * title, or replaces the text of the link currently being read.
	 *
	 * @param text the characters that were read
	 * @param pos  position in the document (unused)
	 */
	@Override
	public void handleText(char[] text, int pos){
		String chunk = new String(text);
		switch(state){
			case NONE:
				pagetext += chunk + " ";
				break;
			case TITLE:
				title = chunk;
				break;
			case HREF:
				linktext = chunk;
				break;
		}
	}

	/**
	 * Handles META tags: remembers the content of the "description" and
	 * "keywords" tags. The tag name is matched case-insensitively and
	 * independently of the default locale.
	 *
	 * @param attribs the attributes of the META tag
	 */
	public void handleMeta(MutableAttributeSet attribs){
		String name = stringAttribute(attribs, HTML.Attribute.NAME);
		String content = stringAttribute(attribs, HTML.Attribute.CONTENT);
		if(name == null || content == null) return;
		// equalsIgnoreCase avoids locale-sensitive case mapping (e.g. the
		// Turkish dotted I, which breaks toUpperCase() comparisons).
		if(name.equalsIgnoreCase("DESCRIPTION")){
			description = content;
		} else if(name.equalsIgnoreCase("KEYWORDS")){
			keywords = content;
		}
	}

	/**
	 * Handles anchor tags. Anchors without an HREF attribute are ignored;
	 * otherwise the href is stored and link-text collection starts.
	 *
	 * @param attribs the attributes of the A tag
	 */
	public void handleAnchor(MutableAttributeSet attribs){
		String href = stringAttribute(attribs, HTML.Attribute.HREF);
		if(href == null) return;
		if(state == HREF){
			// The previous anchor was never closed: complete its pair first
			// so that hrefs and link texts keep alternating.
			links.add(linktext);
		}
		links.add(href);
		// Start from scratch so an anchor without text does not inherit the
		// text of the previous anchor.
		linktext = "";
		state = HREF;
	}

	/**
	 * Returns the page title, limited to 100 characters and with
	 * punctuation replaced by spaces.
	 *
	 * @return the cleaned-up title
	 */
	public String getTitle(){
		return depunct(trim(title, MAX_FIELD_LENGTH));
	}

	/**
	 * Returns the META description, limited to 100 characters and with
	 * punctuation replaced by spaces.
	 *
	 * @return the cleaned-up description
	 */
	public String getDescription(){
		return depunct(trim(description, MAX_FIELD_LENGTH));
	}

	/**
	 * Returns the META keywords, limited to 100 characters and with
	 * punctuation replaced by spaces.
	 *
	 * @return the cleaned-up keywords
	 */
	public String getKeywords(){
		return depunct(trim(keywords, MAX_FIELD_LENGTH));
	}

	/**
	 * Returns the beginning of the page text, limited to 200 characters and
	 * with punctuation replaced by spaces.
	 *
	 * @return the cleaned-up page text excerpt
	 */
	public String getPagetext(){
		return depunct(trim(pagetext, MAX_PAGETEXT_LENGTH));
	}

	/**
	 * Returns the collected links as alternating href / link text entries.
	 * If the list has an odd number of entries (parsing stopped inside an
	 * anchor), the entry "filler" is appended to the list itself so that it
	 * always consists of complete pairs.
	 *
	 * @return the internal link list (not a copy)
	 */
	public Vector<String> getLinks(){
		if(links.size() % 2 != 0){
			links.add("filler");
		}
		return links;
	}

	/**
	 * Shortens a string to at most {@code cut} characters, preferably at a
	 * word boundary.
	 *
	 * A string that already fits is returned unchanged. Otherwise, if the
	 * character right after the cut position is a space, exactly the first
	 * {@code cut} characters are returned; if not, the string is cut at the
	 * last space inside the first {@code cut} characters. When there is no
	 * usable space, the string is cut hard at {@code cut}.
	 *
	 * @param string the string to shorten; {@code null} yields ""
	 * @param cut    the maximum length; values below 1 yield ""
	 * @return the shortened string, never longer than {@code cut}
	 */
	public String trim(String string, int cut){
		if(string == null || cut <= 0) return "";
		if(string.length() <= cut) return string;
		// A word ends exactly at the cut position.
		if(string.charAt(cut) == ' ') return string.substring(0, cut);
		int lastSpace = string.lastIndexOf(' ', cut - 1);
		// No space (or only a leading one): fall back to a hard cut rather
		// than returning nothing.
		if(lastSpace <= 0) return string.substring(0, cut);
		return string.substring(0, lastSpace);
	}

	/**
	 * Replaces every character that is not a letter or digit by a space.
	 * The result has the same length as the input.
	 *
	 * @param string the string to clean
	 * @return the string without punctuation
	 */
	public String depunct(String string){
		StringBuilder cleaned = new StringBuilder(string.length());
		for(int i = 0; i < string.length(); i++){
			char c = string.charAt(i);
			cleaned.append(Character.isLetterOrDigit(c) ? c : ' ');
		}
		return cleaned.toString();
	}

	/**
	 * Reads an attribute as a string.
	 *
	 * @return the attribute value as text, or {@code null} if it is absent
	 */
	private static String stringAttribute(AttributeSet attribs, Object key){
		Object value = attribs.getAttribute(key);
		return value == null ? null : value.toString();
	}
}