import java.io.*;
import java.util.*;


/**
 * This class is responsible for extracting data from HTML documents.
 * It uses a finite state machine model to parse HTML.
 *
 * @version $Id: HTMLParser_FSM.java,v 1.20 2002/09/11 13:22:25 howama Exp $
 * @author Ben Secrest &lt;blsecres@users.sourceforge.net&gt;
 */
public class HTMLParser_FSM implements FileParser, LinkExtractor {
    /**
     * The default logging level for this module.  Most log calls in this
     * class are guarded by comparing this value against an IGLog level
     * constant.
     */
    private final static int LOGLEVEL = 9;

    /** The type of files this parser works with */
    private final static String fileType = "HTML";

    /**
     * The default character encoding for HTML documents.  Used to decode a
     * document until its declared encoding is known.
     */
    private final static String defaultEncoding = "ISO-8859-1";

    /** File extensions for HTML files */
    private final static String[] extensions = {"html", "htm", "shtml",
	"xhtml"};

    /** Mime types for HTML files */
    private final static String[] mimeTypes = {"text/html"};

    /** HTML file magic signature: the bytes of the text "&lt;html" */
    private final static byte[][] magic = {{(byte) '<', (byte) 'h', (byte) 't',
	(byte) 'm', (byte) 'l'}};

    /** HTML headers aren't always at the beginning of the file */
    private final static boolean magicOffset = true;

    /** HTML headers aren't case sensitive */
    private final static boolean magicCase = false;

    /**
     * HTML FileMagic structure built from the signature and the two flags
     * above; handed out by getMagic().
     */
    private final static FileMagic htmlMagic = new FileMagic(magic, magicOffset,
	    magicCase);

    /**
     * The buffer size to mark for reading character encoding.  This value was
     * extracted from src/java/io/BufferedReader.java.  It is passed as the
     * read-ahead limit to BufferedReader.mark() before the encoding scan.
     * @see java.io.BufferedReader
     */
    private final static int BUFFER_SIZE = 8192;

    /*
     * encodings for FSM operations (the "operation" argument of runFSM)
     */
    /**
     * Instruct the FSM to extract document attributes and possibly hyperlinks
     */
    private final static byte GET_ATTRIBUTES	= 0x01;
    /** Instruct the FSM to extract the documents character encoding */
    private final static byte GET_ENCODING	= 0x02;


    /*
     * encodings for parser state machine
     */
    /** Skipping tokens until a '&lt;' is seen */
    private final static byte MAIN		= 0x00;
    /** Just read '&lt;'; the next token decides which kind of tag this is */
    private final static byte OPEN_TAG_START	= 0x01;
    /** Inside the opening tag of an element of interest; skip to '&gt;' */
    private final static byte OPEN_TAG_END	= 0x02;
    /** Collecting the text between an opening and a closing tag */
    private final static byte READ_CDATA	= 0x03;
    /** Read '&lt;' while collecting text; check whether a '/' follows */
    private final static byte NEW_TAG_START	= 0x04;
    /** Read "&lt;/" while collecting text; compare name with the open tag */
    private final static byte CLOSE_TAG_START	= 0x05;
    /** Matching closing tag found; skip to its '&gt;' */
    private final static byte CLOSE_TAG_END	= 0x06;
    /** Inside a meta tag or XML declaration, looking for attribute names */
    private final static byte META_TAG		= 0x07;
    /** An attribute name was read; expecting '=' */
    private final static byte TAG_ATTRIB	= 0x08;
    /** '=' was read; expecting a quoted or unquoted attribute value */
    private final static byte TAG_VALUE_START	= 0x09;
    /** Inside a quoted attribute value; looking for the closing quote */
    private final static byte TAG_VALUE_END	= 0x0a;
    /** Read "&lt;/" before the end of the head; check for "head" */
    private final static byte HEAD_END		= 0x0b;
    /** Read "&lt;?"; check for "xml" */
    private final static byte XML_DECL		= 0x0c;
    /** Inside an "a" or "frame" tag, looking for href/src */
    private final static byte LINK		= 0x0d;
    /** href/src was read; expecting '=' */
    private final static byte HREF		= 0x0e;
    /** '=' was read after href/src; reading an unquoted URL or a quote */
    private final static byte URL_START		= 0x0f;
    /** Inside a quoted URL; looking for the closing quote */
    private final static byte URL_END		= 0x10;

    // encodings for meta tag type
    /** No meta type recognised (yet) */
    private final static byte NO_META		= 0x00;
    /** Mask selecting the meta type bits (0x01 - 0x10) out of curMeta */
    private final static byte META_MASK		= 0x1f;

    // meta tags for attribute parser
    private final static byte AUTHOR		= 0x01;
    private final static byte DESCR		= 0x02;
    private final static byte KEYWORDS		= 0x04;

    // meta tags for charset parser
    private final static byte CHARSET		= 0x08;
    private final static byte CONTENT_TYPE	= 0x10;

    // content distinguishers: flag bits outside META_MASK that record which
    // attribute (name/http-equiv or content) the value being read belongs to
    private final static byte NAME		= 0x20;
    private final static byte CONTENT		= 0x40;

    /**
     * Exception for bad character encodings.  Thrown by runFSM when the
     * encoding found in the document differs from the default encoding, so
     * the caller can re-read the input using the encoding stored in the
     * IGFile.
     */
    private class BadEncodingException extends Exception { }

    /** Determines if this parser will search for title */
    private boolean wantTitle;

    /** Determines if this parser will search for author */
    private boolean wantAuthor;

    /** Determines if this parser will search for description */
    private boolean wantDescription;

    /** Determines if this parser will search for keywords */
    private boolean wantKeywords;

    /** Determines if the parser will provide file type */
    private boolean wantFileType;

    /** Determines if the parser will provide parser information */
    private boolean wantParser;

    /**
     * Determines if this parser will search for hyperlinks.  Unlike the other
     * preferences this one is set through wantURLs(boolean), not through
     * setWantedItems(IGKeySet).
     */
    private boolean wantURLs;

    /**
     * A document's URL's.
     * NOTE(review): no method in this class reads or writes this field;
     * runFSM collects links in a local variable of the same name.
     */
    private HashSet hyperlinks;

    /**
     * The logging object for this module.  The parse methods return without
     * doing anything while this is <tt>null</tt>.
     */
    private IGLog log;



    /**
     * Construct a new HTMLParser_FSM with no logger attached and with every
     * extraction preference switched off.
     */
    public HTMLParser_FSM() {
	this.log = null;

	this.wantParser = false;
	this.wantFileType = false;
	this.wantURLs = false;
	this.wantKeywords = false;
	this.wantDescription = false;
	this.wantAuthor = false;
	this.wantTitle = false;
    }


    /**
     * Record which document attributes should be extracted by later calls to
     * the parse methods.  The hyperlink preference is not affected; it is
     * controlled by wantURLs(boolean).
     * @param wanted The key set that is queried once for each supported key
     */
    public void setWantedItems(IGKeySet wanted) {
	this.wantTitle       = wanted.wants(IGKey.TITLE);
	this.wantAuthor      = wanted.wants(IGKey.AUTHOR);
	this.wantDescription = wanted.wants(IGKey.DESCRIPTION);
	this.wantKeywords    = wanted.wants(IGKey.KEYWORDS);
	this.wantFileType    = wanted.wants(IGKey.FILE_TYPE);
	this.wantParser      = wanted.wants(IGKey.PARSER);
    }


    /**
     * Attach the logger this parser reports to.  Until a non-null logger has
     * been attached the parse methods return without doing any work.
     * @param logObj The object to use for logging data
     */
    public void setLog(IGLog logObj) {
	this.log = logObj;
    }


    /**
     * Fill file structure with data from opened input stream.
     *
     * <p>If the IGFile carries no character encoding, the document is first
     * decoded with the default encoding while runFSM looks for the declared
     * one.  When the declared encoding differs, the bytes have to be decoded
     * again from the start.  Resetting the reader cannot achieve that because
     * the bytes it already consumed are gone from the underlying stream, so
     * the byte stream itself is buffered and marked here and rewound before
     * the second pass.</p>
     *
     * <p>Does nothing when no logger has been set.  The stream is closed when
     * parsing completes normally.</p>
     *
     * @param file The IGFile structure to fill in
     * @param stream The stream to read data from
     * @throws IOException if an error occurs reading data from the stream
     * @throws StreamResetException if the input has to be re-read but cannot
     *	be rewound; the caller has to reopen the stream
     */
    public void parse(IGFile file, InputStream stream)
	    throws IOException, StreamResetException {
	if (log == null)
	    // FIXME
	    return;

	String encoding = file.getString(IGKey.FILE_ENCODING);

	InputStream in = stream;
	if (encoding == null) {
	    /*
	     * The encoding scan reads at most BUFFER_SIZE characters before
	     * the reader's own mark becomes invalid, but the decoder may have
	     * pulled additional bytes from the stream.  Leave generous head
	     * room; if it is exceeded anyway reset() fails and the caller is
	     * asked to reopen the stream.
	     */
	    in = new BufferedInputStream(stream);
	    in.mark(4 * BUFFER_SIZE);
	}

	BufferedReader reader = new BufferedReader(
		new InputStreamReader(in, (encoding == null
		    ? defaultEncoding : encoding)));

	try {
	    runFSM(file, reader, GET_ATTRIBUTES);
	} catch (BadEncodingException bee) {
	    // rewind the bytes, not the (wrongly decoded) characters
	    try {
		in.reset();
	    } catch (IOException ioe) {
		throw new StreamResetException();
	    }

	    reader = new BufferedReader(new InputStreamReader(in,
		    file.getString(IGKey.FILE_ENCODING)));

	    /* this can't be thrown twice, the encoding is known now */
	    try {
		runFSM(file, reader, GET_ATTRIBUTES);
	    } catch (BadEncodingException bee2) {
		throw new IOException("This shouldn't happen");
	    }
	}

	reader.close();
    }


    /**
     * Extract required attributes from an HTML document stored in a file.
     *
     * <p>The file named by the IGFile's location is opened with the encoding
     * already stored in the IGFile, or with the default encoding if there is
     * none.  If runFSM reports that the document uses a different encoding
     * (BadEncodingException) or that it could not rewind the reader
     * (StreamResetException), the file is reopened with the encoding runFSM
     * stored in the IGFile and parsed once more.  The file is always closed,
     * also when parsing fails.</p>
     *
     * <p>Does nothing when no logger has been set.</p>
     *
     * @param file The IGFile object to fill with data
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if an error occurs reading the file
     */
    public void parse(IGFile file) throws IOException, FileNotFoundException {
	if (log == null)
	    // FIXME
	    return;

	if (LOGLEVEL >= IGLog.PROCEDURE)
	    log.add(IGLog.PROCEDURE, "HTMLParser_FSM.parse(IGFile)");

	if (LOGLEVEL >= IGLog.FILE)
	    log.addResource(IGLog.FILE, "PROCESS_FILE",
		    new String[]{file.getLocation()});

	// honour an encoding that is already known, like parse(IGFile,
	// InputStream) does
	String encoding = file.getString(IGKey.FILE_ENCODING);
	BufferedReader reader = openFileReader(file.getLocation(),
		(encoding == null ? defaultEncoding : encoding));

	try {
	    if (LOGLEVEL >= IGLog.FILE)
		log.addResource(IGLog.FILE, "HTML_EXTRACT", null);

	    try {
		runFSM(file, reader, GET_ATTRIBUTES);
	    } catch (BadEncodingException bee) {
		reader.close();
		reader = openFileReader(file.getLocation(),
			file.getString(IGKey.FILE_ENCODING));
		reparse(file, reader);
	    } catch (StreamResetException sre) {
		// runFSM has already closed the reader in this case
		reader = openFileReader(file.getLocation(),
			file.getString(IGKey.FILE_ENCODING));
		reparse(file, reader);
	    }
	} finally {
	    // closing an already closed reader has no effect
	    reader.close();
	}
    }


    /**
     * Open a file for reading characters in the given encoding.  The file
     * handle is released again if the reader cannot be created.
     * @param location The path of the file to open
     * @param charsetName The name of the character encoding to decode with
     * @return A buffered reader positioned at the start of the file
     * @throws IOException if the file cannot be opened or the encoding is not
     *	supported
     */
    private BufferedReader openFileReader(String location, String charsetName)
	    throws IOException {
	FileInputStream in = new FileInputStream(location);

	try {
	    return new BufferedReader(new InputStreamReader(in, charsetName));
	} catch (IOException ioe) {
	    in.close();
	    throw ioe;
	}
    }


    /**
     * Run the attribute extraction a second time, after the document's
     * encoding has been stored in the IGFile.  runFSM skips the encoding scan
     * in that situation, so neither of its special exceptions is expected.
     * @param file The IGFile object to fill with data
     * @param reader The freshly opened reader to parse from
     * @throws IOException if an error occurs reading data, or if runFSM
     *	unexpectedly asks for another restart
     */
    private void reparse(IGFile file, BufferedReader reader)
	    throws IOException {
	try {
	    runFSM(file, reader, GET_ATTRIBUTES);
	} catch (BadEncodingException bee) {
	    throw new IOException("This shouldn't happen");
	} catch (StreamResetException sre) {
	    throw new IOException("This shouldn't happen");
	}
    }


    /**
     * Function to drive the HTML parser.  The parser can extract several
     * different types of information.  The type is controlled by the operation
     * parameter.  The document attributes (title, author...) can be retrieved
     * with an operation of GET_ATTRIBUTES.  The document's character encoding
     * can be retrieved with GET_ENCODING.
     *
     * <p>With GET_ATTRIBUTES and no encoding stored in the IGFile, the reader
     * is marked, this method calls itself with GET_ENCODING, and the reader is
     * reset afterwards.  The results are written to the IGFile at the end:
     * the wanted attributes for GET_ATTRIBUTES, the encoding (or the default
     * encoding if none was found) for GET_ENCODING.  Any other operation
     * value makes the method return immediately.</p>
     *
     * @param file The IGFile structure to fill
     * @param reader The reader object to parse data from
     * @param operation The function to perform on the data
     * @throws IOException if an error occurs reading data
     * @throws BadEncodingException if the encoding found in the document is
     *	not the default encoding; the reader has been reset to its mark
     * @throws StreamResetException if the reader could not be reset after the
     *	encoding scan; the reader has been closed
     * @see #GET_ATTRIBUTES
     * @see #GET_ENCODING
     */
    private void runFSM(IGFile file, BufferedReader reader, byte operation)
	    throws IOException, BadEncodingException, StreamResetException {
	if (LOGLEVEL >= IGLog.PROCEDURE)
	    log.add(IGLog.PROCEDURE, "HTMLParser_FSM.runFSM(Reader, byte)");

	String title = null;
	String author = null;
	String description = null;
	String keywords = null;
	String characterEncoding = null;
	HashSet links = null;

	if (operation == GET_ATTRIBUTES) {
	    characterEncoding = file.getString(IGKey.FILE_ENCODING);

	    if (characterEncoding == null) {
		// mark the position in the stream for reset
		reader.mark(BUFFER_SIZE);

		// no character encoding for file, extract it
		runFSM(file, reader, GET_ENCODING);

		characterEncoding = file.getString(IGKey.FILE_ENCODING);

		if (LOGLEVEL >= IGLog.RESULT)
		    log.addResource(IGLog.RESULT, "HTML_FIND_CHARSET",
			new String[]{characterEncoding});

		// attempt to reset the file to the mark made at the beginning
		try {
		    reader.reset();
		} catch (IOException ioe) {
		    log.addError(-1, "HTML_RESET_ERROR",
			    new String[]{ioe.getMessage()});

		    if (LOGLEVEL >= IGLog.FILE)
			log.addResource(IGLog.FILE, "HTML_REOPEN_FILE",
				null);

		    reader.close();
		    /*
		     * couldn't reset the file to the beginning, reopen the
		     * stream
		     */
		    throw new StreamResetException();
		}

		/*
		 * if the character set for the document doesn't match the
		 * default character set, throw an exception to have the stream
		 * encoding reset
		 */
		if (characterEncoding != null &&
			! characterEncoding.equalsIgnoreCase(defaultEncoding)) {
		    log.addError(-1, "HTML_WRONG_CHARSET",
			    null);

		    throw new BadEncodingException();
		}
	    }


	    /*
	     * if not searching for an item, assign it a value so the search
	     * can kick out when all items != null
	     */
	    title = (wantTitle ? null : "");
	    author = (wantAuthor ? null : "");
	    description = (wantDescription ? null : "");
	    keywords = (wantKeywords ? null : "");
	    links = (wantURLs ? new HashSet() : null);
	} else if (operation == GET_ENCODING) {
	    if (LOGLEVEL >= IGLog.FILE)
		log.addResource(IGLog.FILE, "HTML_CHECK_CHARSET", null);

	    characterEncoding = null;
	} else {
	    return;
	}


	StreamTokenizer tokenizer = new StreamTokenizer(reader);

	// configure the tokenizer
	tokenizer.eolIsSignificant(false);
	tokenizer.lowerCaseMode(false);		// ! convert words to lower
	tokenizer.ordinaryChar('"');		// ! use quote feature
	tokenizer.ordinaryChar('.');
	/*
	 * '-' carries the tokenizer's numeric flag by default and wordChars()
	 * only adds the word flag, so without this call a token such as "-5"
	 * is returned as TT_NUMBER instead of as text
	 */
	tokenizer.ordinaryChar('-');
	tokenizer.ordinaryChar('/');		// ! proc C-style comments
	tokenizer.ordinaryChar('\'');		// "                 "
	tokenizer.ordinaryChars('0', '9');	// ! process numbers
	tokenizer.slashSlashComments(false);	// ! process comments
	tokenizer.slashStarComments(false);	// "                "
	tokenizer.wordChars('#', '&');		// everything except HTML
	tokenizer.wordChars('(', '.');		//   tag delimiters are
	tokenizer.wordChars('0', ';');		//   considered word characters
	tokenizer.wordChars('?', '[');
	tokenizer.wordChars(']', '~');

	boolean isSearching = true;	// still searching for data
	boolean isPassedHead = false;	// left document <head></head>
	byte curState = MAIN;		// current FSM state
	byte nextState = MAIN;		// next FSM state
	byte curMeta = NO_META;		// current meta tag being parsed
	String curValue = "";		// current CDATA/content/URL value
	String curTag = "";		// current tag being parsed
	String attrValue = "";		// quoted meta attribute value being read
	int quoteChar = 0;		// quote that opened the current value


	if (LOGLEVEL >= IGLog.SECTION)
	    log.addResource(IGLog.SECTION, "FP_BEGIN_PARSE",
		    new String[]{fileType});

	while (isSearching &&
	       (tokenizer.nextToken() != StreamTokenizer.TT_EOF)) {

	//    if (LOGLEVEL >= IGLog.DEBUG)
	//	log.add(IGLog.DEBUG, "State: " + curState + ' ' +
	//		tokenizer.toString());

	    switch (curState) {
	    /*
	     * main state, consume tokens until the start of an HTML
	     * tag is found
	     */
	    case MAIN :
		if (tokenizer.ttype == '<')
		    nextState = OPEN_TAG_START;
		break;
	    /*
	     * an html tag has been started, examine its type and
	     * transition accordingly
	     */
	    case OPEN_TAG_START :
		switch (tokenizer.ttype) {
		case StreamTokenizer.TT_WORD :
		    if (wantURLs && operation == GET_ATTRIBUTES
			    && (tokenizer.sval.equalsIgnoreCase("a")
				|| tokenizer.sval.equalsIgnoreCase("frame"))) {
			nextState = LINK;
		    } else if (! isPassedHead) {
			if (operation == GET_ATTRIBUTES && title == null
				&& tokenizer.sval.equalsIgnoreCase("title")) {
			    nextState = OPEN_TAG_END;
			    curTag = tokenizer.sval.toLowerCase();
			    curValue = "";
			} else if (tokenizer.sval.equalsIgnoreCase("meta")) {
			    nextState = META_TAG;
			} else {
			    nextState = MAIN;
			}
		    } else /* isPassedHead */ {
			/*
			 * passed head, still a chance of finding a description
			 * in a <p> or a title in a <h#>
			 */
			if (operation == GET_ATTRIBUTES) {
			    if ((description == null
					&& tokenizer.sval.equalsIgnoreCase("p"))
				    || (title == null
					&& isHeadingTag(tokenizer.sval))) {
				nextState = OPEN_TAG_END;
				curTag = tokenizer.sval.toLowerCase();
				curValue = "";
			    } else {
				nextState = MAIN;
			    }
			}
		    }
		    break;
		case '?' :
		    if (operation == GET_ENCODING)
			nextState = XML_DECL;
		    else
			nextState = MAIN;
		    break;
		case '/' :
		    if (! isPassedHead)
			nextState = HEAD_END;
		    else
			nextState = MAIN;
		    break;
		default :
		}
		break;
	    /*
	     * confirm <? starts a XML declaration
	     */
	    case XML_DECL :
		// treat XML declaration like a meta tag
		if (tokenizer.ttype == StreamTokenizer.TT_WORD
			&& tokenizer.sval.equalsIgnoreCase("xml"))
		    nextState = META_TAG;
		else
		    nextState = MAIN;
		break;
	    /*
	     * check for the end of the document <head>, once passed the head,
	     * there's no point in search for a character set
	     */
	    case HEAD_END :
		if (tokenizer.ttype == StreamTokenizer.TT_WORD
			&& tokenizer.sval.equalsIgnoreCase("head")) {
		    isPassedHead = true;

		    if (operation == GET_ATTRIBUTES) {
			/*
			 * were in <head>, not now, keywords and author
			 * weren't found
			 */
			if (keywords == null)
			    keywords = "";
			if (author == null)
			    author = "";
		    } else if (operation == GET_ENCODING) {
			isSearching = false;
		    }
		}

		nextState = MAIN;
		break;
	    /*
	     * a tag the parser is interested in has been started,
	     * consume any attributes until the end of the tag
	     */
	    case OPEN_TAG_END :
		if (tokenizer.ttype == '>')
		    nextState = READ_CDATA;
		break;
	    /*
	     * inside an open/close set of tags, read CDATA and append
	     * it to a string, check for possible end tags
	     */
	    case READ_CDATA :
		switch (tokenizer.ttype) {
		case '<' :
		    nextState = NEW_TAG_START;
		    break;
		case StreamTokenizer.TT_WORD :
		    curValue += tokenizer.sval + ' ';
		    break;
		default :
		    curValue += (char) tokenizer.ttype;
		    break;
		}
		break;
	    /*
	     * start of a new tag in the data stream, check if it's a
	     * closing tag
	     */
	    case NEW_TAG_START :
		if (tokenizer.ttype == '/')
		    nextState = CLOSE_TAG_START;
		else {
		    // nested tag, its name is dropped from the collected text
		    nextState = READ_CDATA;
		}
		break;
	    /*
	     * new tag was a closing tag, see if it matches the
	     * opening tag, if so, add collected data to proper
	     * variable
	     */
	    case CLOSE_TAG_START :
		if (tokenizer.ttype == StreamTokenizer.TT_WORD
			&& tokenizer.sval.equalsIgnoreCase(curTag)) {
		    if (curTag.equals("title")) {
			title = curValue.trim();
			if (LOGLEVEL >= IGLog.PROGRESS)
			    log.addResource(IGLog.PROGRESS, "FP_FOUND_TITLE",
				    new String[]{title});
		    } else if (curTag.equals("p")) {
			description = curValue.trim();
			if (LOGLEVEL >= IGLog.PROGRESS)
			    log.addResource(IGLog.PROGRESS, "FP_FOUND_DESC",
				    new String[]{description});
		    } else if (curTag.charAt(0) == 'h') {
			title = curValue.trim();
			if (LOGLEVEL >= IGLog.PROGRESS)
			    log.addResource(IGLog.PROGRESS, "FP_FOUND_TITLE",
				    new String[]{title});
		    }

		    nextState = CLOSE_TAG_END;
		} else {
		    // closing tag of a nested element, keep collecting
		    nextState = READ_CDATA;
		}
		break;
	    /*
	     * consume until tag is ended
	     */
	    case CLOSE_TAG_END :
		if (tokenizer.ttype == '>')
		    nextState = MAIN;
		break;
	    /*
	     * a meta tag was encountered, search for either name or
	     * content attributes, keep track of which was found, if meta
	     * tag ends, place collected data in proper variables
	     */
	    case META_TAG :
		switch (tokenizer.ttype) {
		case StreamTokenizer.TT_WORD :	// attribute
		    if (tokenizer.sval.equalsIgnoreCase("name")
			    || tokenizer.sval.equalsIgnoreCase("http-equiv")) {
			curMeta |= NAME;
		    } else if (tokenizer.sval.equalsIgnoreCase("content")) {
			curMeta |= CONTENT;
		    } else if (operation == GET_ENCODING
			    && (tokenizer.sval.equalsIgnoreCase("encoding")
				|| tokenizer.sval.equalsIgnoreCase(
				    "charset"))) {
			/*
			 * the attribute value is the character set itself;
			 * collect it like a content value
			 */
			curMeta = (byte) (CHARSET | CONTENT);
			curValue = "";
		    }
		    /*
		     * attributes of no interest take the same path so that
		     * their value is skipped instead of ending the tag
		     */
		    nextState = TAG_ATTRIB;
		    break;
		case '<' :	// End of Tag
		case '>' :
		case '/' :
		case '?' :
		    if (operation == GET_ATTRIBUTES) {
			String metaValue = curValue.trim();

			switch (curMeta & META_MASK) {
			case AUTHOR :
			    if (author == null) {
				author = metaValue;
				if (LOGLEVEL >= IGLog.PROGRESS)
				    log.addResource(IGLog.PROGRESS,
					    "FP_FOUND_AUTHOR",
					    new String[]{author});
			    }
			    break;
			case DESCR :
			    if (description == null) {
				description = metaValue;
				if (LOGLEVEL >= IGLog.PROGRESS)
				    log.addResource(IGLog.PROGRESS,
					    "FP_FOUND_DESC",
					    new String[]{description});
			    }
			    break;
			case KEYWORDS :
			    if (keywords == null) {
				keywords = metaValue;
				if (LOGLEVEL >= IGLog.PROGRESS)
				    log.addResource(IGLog.PROGRESS,
					    "FP_FOUND_KEYWORDS",
					    new String[]{keywords});
			    }
			    break;
			}
		    } else if (operation == GET_ENCODING) {
			String found = null;

			switch (curMeta & META_MASK) {
			case CONTENT_TYPE :
			    found = extractCharset(curValue);
			    break;
			case CHARSET :
			    found = curValue.trim();
			    break;
			}

			// an empty name is as good as none
			if (found != null && found.length() > 0) {
			    characterEncoding = found;
			    if (LOGLEVEL >= IGLog.PROGRESS)
				log.addResource(IGLog.PROGRESS,
					"HTML_FIND_CHARSET",
					new String[]{characterEncoding});
			}
		    }
		    /* FALLTHROUGH */
		default :
		    curMeta = NO_META;
		    nextState = MAIN;
		    curValue = "";
		    break;
		}
		break;
	    /*
	     * search for = sign for attribute/value pair
	     */
	    case TAG_ATTRIB :
		if (tokenizer.ttype == '=') {
		    nextState = TAG_VALUE_START;
		} else {
		    /*
		     * attribute without a value, or close of the tag: forget
		     * the pending attribute, push the token back and let the
		     * META_TAG state handle it
		     */
		    curMeta &= ~(NAME | CONTENT);
		    tokenizer.pushBack();
		    nextState = META_TAG;
		}
		break;
	    /*
	     * read in value from attribute/value pair, if it's the
	     * pair's name keep track of which type, otherwise store
	     * data
	     */
	    case TAG_VALUE_START :
		switch (tokenizer.ttype) {
		case '"' :
		case '\'' :
		    quoteChar = tokenizer.ttype;
		    attrValue = "";
		    nextState = TAG_VALUE_END;
		    break;
		case StreamTokenizer.TT_WORD :
		    if ((curMeta & NAME) != 0) {
			curMeta |= getMetaType(tokenizer.sval);
			curMeta &= ~NAME;
		    } else if ((curMeta & CONTENT) != 0) {
			curValue = tokenizer.sval;
			curMeta &= ~CONTENT;
		    }
		    nextState = META_TAG;
		    break;
		default :
		    /*
		     * read something other than a quoted string or an unquoted
		     * value, let META_TAG state handle it
		     */
		    tokenizer.pushBack();
		    nextState = META_TAG;
		    break;
		}
		break;
	    /*
	     * reading data from a quoted attribute's value, search for ending
	     * quote character
	     */
	    case TAG_VALUE_END :
		if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
		    attrValue += tokenizer.sval + ' ';
		} else if (tokenizer.ttype == quoteChar
			|| tokenizer.ttype == '>' || tokenizer.ttype == '<') {
		    /*
		     * end of the value.  The value is kept apart from curValue
		     * until now so that a name attribute cannot overwrite a
		     * content value read earlier in the same tag.
		     */
		    if ((curMeta & NAME) != 0) {
			curMeta |= getMetaType(attrValue.trim());
			curMeta &= ~NAME;
		    } else if ((curMeta & CONTENT) != 0) {
			curValue = attrValue;
			curMeta &= ~CONTENT;
		    }

		    if (tokenizer.ttype != quoteChar) {
			/*
			 * unterminated string value, return to META_TAG and
			 * process the end of the tag there
			 */
			tokenizer.pushBack();
		    }
		    nextState = META_TAG;
		} else {
		    // includes a quote of the kind that did not open the value
		    attrValue += (char) tokenizer.ttype;
		}
		break;
	    /*
	     * inside an <a> or <frame> tag, search for the attribute holding
	     * the link target
	     */
	    case LINK :
		switch (tokenizer.ttype) {
		case StreamTokenizer.TT_WORD :
		    if (tokenizer.sval.equalsIgnoreCase("href")
			    || tokenizer.sval.equalsIgnoreCase("src"))
			nextState = HREF;
		    break;
		case '<' :
		case '>' :
		case '/' :
		    nextState = MAIN;
		    break;
		}
		break;
	    /*
	     * search for = sign following href/src
	     */
	    case HREF :
		if (tokenizer.ttype == '=') {
		    nextState = URL_START;
		    curValue = "";
		} else
		    nextState = MAIN;
		break;
	    /*
	     * read an unquoted URL up to the end of the tag, or find the
	     * quote that opens a quoted URL
	     */
	    case URL_START :
		switch (tokenizer.ttype) {
		case '>' :
		case '<' :
		    if (LOGLEVEL >= IGLog.PROGRESS)
			log.addResource(IGLog.PROGRESS, "EXTRACT_HYPERLINK",
				new String[]{curValue});
		    links.add(curValue);
		    curValue = "";
		    nextState = MAIN;
		    break;
		case '"' :
		case '\'' :
		    quoteChar = tokenizer.ttype;
		    curValue = "";
		    nextState = URL_END;
		    break;
		case StreamTokenizer.TT_WORD :
		    curValue += tokenizer.sval;
		    break;
		default :
		    curValue += (char) tokenizer.ttype;
		    break;
		}
		break;
	    /*
	     * reading a quoted URL, search for the ending quote character or
	     * the end of the tag
	     */
	    case URL_END :
		if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
		    curValue += tokenizer.sval;
		} else if (tokenizer.ttype == quoteChar
			|| tokenizer.ttype == '>' || tokenizer.ttype == '<') {
		    if (LOGLEVEL >= IGLog.PROGRESS)
			log.addResource(IGLog.PROGRESS, "EXTRACT_HYPERLINK",
				new String[]{curValue});
		    links.add(curValue);
		    curValue = "";
		    nextState = MAIN;
		} else {
		    // includes a quote of the kind that did not open the URL
		    curValue += (char) tokenizer.ttype;
		}
		break;
	    }

	    curState = nextState;

	    /*
	     * stop early once everything asked for has been found; when links
	     * are wanted the whole document has to be read
	     */
	    if ((operation == GET_ATTRIBUTES && ! wantURLs && title != null
			&& author != null && description != null
			&& keywords != null)
		    || (operation == GET_ENCODING && characterEncoding != null))
		isSearching = false;
	}

	if (LOGLEVEL >= IGLog.SECTION)
	    log.addResource(IGLog.SECTION, "FP_FINISH_PARSE",
		    new String[]{fileType});

	if (operation == GET_ATTRIBUTES) {
	    if (wantTitle)
		file.put(IGKey.TITLE, title);
	    if (wantAuthor)
		file.put(IGKey.AUTHOR, author);
	    if (wantDescription)
		file.put(IGKey.DESCRIPTION, description);
	    if (wantKeywords)
		file.put(IGKey.KEYWORDS, keywords);
	    if (wantFileType)
		file.put(IGKey.FILE_TYPE, fileType);
	    if (wantParser)
		file.put(IGKey.PARSER, getClass().getName());

	    /*
	     * store URLs in the IGFile so they can be extracted by other
	     * routines
	     */
	    if (wantURLs)
		file.put(IGKey.URLS, links);
	} else if (operation == GET_ENCODING) {
	    file.put(IGKey.FILE_ENCODING, (characterEncoding != null
			?  characterEncoding : defaultEncoding));
	}
    }


    /**
     * Decide whether a tag name is one of the headings h1 to h5, in either
     * case.  Safe for names of any length.
     * @param tag The tag name as read from the document
     * @return <tt>true</tt> if the name is a heading tag
     */
    private static boolean isHeadingTag(String tag) {
	return tag.length() == 2
	    && (tag.charAt(0) == 'h' || tag.charAt(0) == 'H')
	    && tag.charAt(1) >= '1' && tag.charAt(1) <= '5';
    }


    /**
     * Map the value of a meta tag's name (or http-equiv) attribute to the
     * internal meta type code.  The comparison ignores case.
     * @param name The value of the meta tag's name attribute
     * @return AUTHOR, DESCR, KEYWORDS or CONTENT_TYPE for the names
     *	"author", "description", "keywords" and "content-type"; NO_META for
     *	anything else
     */
    private byte getMetaType(String name) {
	byte type = NO_META;

	if (name.equalsIgnoreCase("author")) {
	    type = AUTHOR;
	} else if (name.equalsIgnoreCase("description")) {
	    type = DESCR;
	} else if (name.equalsIgnoreCase("keywords")) {
	    type = KEYWORDS;
	} else if (name.equalsIgnoreCase("content-type")) {
	    type = CONTENT_TYPE;
	}

	return type;
    }


    /**
     * Extract URL's from an HTML document.  The links are taken from the
     * IGKey.URLS entry that a previous parse stored in the IGFile; the entry
     * is removed from the IGFile by this call.  Works without a logger.
     * @param file The file to get URL's from
     * @return An array of URL's as String's, as produced by
     *	IGMisc.hashSetToStringArray
     */
    public String[] getLinks(IGFile file) {
	// the parse methods tolerate a missing logger, so must this one
	if (log != null && LOGLEVEL >= IGLog.PROCEDURE)
	    log.add(IGLog.PROCEDURE, "HTMLParser_FSM.getLinks(IGFile["
		    + file.getLocation() + "])");
	HashSet urls = (HashSet) file.get(IGKey.URLS);
	file.remove(IGKey.URLS);
	return IGMisc.hashSetToStringArray(urls);
    }


    /**
     * Switch hyperlink collection on or off for subsequent parses.
     * @param pref <tt>true</tt> to collect links while parsing,
     *	<tt>false</tt> to collect none.
     */
    public void wantURLs(boolean pref) {
	this.wantURLs = pref;
    }


    /**
     * Supply extensions this parser can handle
     * @return String array of file extensions.  A fresh copy is returned so
     *	that callers cannot alter the parser's own list.
     */
    public String[] getExtensions() {
	return (String[]) extensions.clone();
    }


    /**
     * Supply mime types this parser can handle
     * @return String array of mime types.  A fresh copy is returned so that
     *	callers cannot alter the parser's own list.
     */
    public String[] getMimeTypes() {
	return (String[]) mimeTypes.clone();
    }


    /**
     * Supply file magic for files this parser can handle
     * @return The shared FileMagic object describing the HTML magic
     *	signature
     */
    public FileMagic getMagic() {
	return htmlMagic;
    }


    /**
     * Extract the character set from a Content-Type string such as
     * <tt>text/html; charset=utf-8</tt>.  The parameter name is matched
     * without regard to case, white space around the '=' is allowed, the
     * value may be quoted, and an unquoted value ends at white space or ';'.
     * @param contentType The Content-type data string, may be
     *	<tt>null</tt>
     * @return The character set name, or <tt>null</tt> if the string holds no
     *	<tt>charset=value</tt> parameter with a non-empty value
     */
    private static String extractCharset(String contentType) {
        if (contentType == null)
            return null;

        final String key = "charset";
        final int length = contentType.length();

        for (int start = 0; start + key.length() <= length; start++) {
            // compare in place; lower-casing a copy could shift positions
            if (! contentType.regionMatches(true, start, key, 0,
                        key.length()))
                continue;

            int pos = start + key.length();
            while (pos < length
                    && Character.isWhitespace(contentType.charAt(pos)))
                pos++;

            // "charset" without '=' is not a parameter, keep looking
            if (pos >= length || contentType.charAt(pos) != '=')
                continue;

            pos++;
            while (pos < length
                    && Character.isWhitespace(contentType.charAt(pos)))
                pos++;

            char quote = 0;
            if (pos < length && (contentType.charAt(pos) == '"'
                        || contentType.charAt(pos) == '\'')) {
                quote = contentType.charAt(pos);
                pos++;
            }

            int stop = pos;
            while (stop < length) {
                char c = contentType.charAt(stop);
                if (quote != 0 ? c == quote
                        : (c == ';' || Character.isWhitespace(c)))
                    break;
                stop++;
            }

            String charset = contentType.substring(pos, stop).trim();
            return (charset.length() == 0 ? null : charset);
        }

        return null;
    }
}
