///////////////////////////////////////////////////////////////////////////////
//                                                                           //
//  Notice to licensees:                                                     //
//                                                                           //
//  This source code is the exclusive, proprietary intellectual property of  //
//  Sharkysoft (sharkysoft.com).  You may view this source code as a         //
//  supplement to other product documentation, but you may not distribute    //
//  it or use it for any other purpose without written consent from          //
//  Sharkysoft.                                                              //
//                                                                           //
//  You are permitted to modify and recompile this source code, but you may  //
//  not remove this notice.  If you add features to or fix errors in this    //
//  code, please consider sharing your changes with Sharkysoft for possible  //
//  incorporation into future releases of the product.  Thanks!              //
//                                                                           //
//  For more information about Sharkysoft products and services, please      //
//  visit Sharkysoft on the web at                                           //
//                                                                           //
//       http://sharkysoft.com/                                              //
//                                                                           //
//  Thank you for using Lava!                                                //
//                                                                           //
///////////////////////////////////////////////////////////////////////////////



package lava.text.html;



import java.util.Stack;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import lava.clib.Ctype;
import lava.io.IoToolbox;
import lava.io.StreamParser;
import lava.io.UnreadBuffer;



/*******************************************************************************
Parses HTML source.

<p><b>Details:</b> This class parses HTML source by separating the source components into tags, text, and comments.  <code>HtmlParser</code> reads text from a <code>PushbackReader</code> and returns a stream of objects representing parsed entities.  Each of the objects is an instance of <code>HtmlComponent</code>, which has many subclasses (refer to the see-also section).</p>

<p>To gain an appreciation for the manner in which HtmlParser is able to parse and tokenize HTML source, the following sample program is provided.  Try this program on your favorite URL.</p>

<table border=1><tr><td><pre>
*import java.io.InputStreamReader;
*import java.io.OutputStreamWriter;
*import {@link lava.io lava.io}.{@link lava.io.UnlimitedPushbackReader UnlimitedPushbackReader};
*import {@link lava.io lava.io}.{@link lava.io.UrlInputStream UrlInputStream};
*import {@link lava.string lava.string}.{@link lava.string.StringEncoder StringEncoder};
*import {@link lava.text lava.text}.{@link lava.text.ParallelColumnsWriter ParallelColumnsWriter};
*import {@link lava.text.html lava.text.html}.{@link HtmlComponent HtmlComponent};
*import {@link lava.text.html lava.text.html}.{@link HtmlParser HtmlParser};
*
*
*
*class parseHtml
*{
*
*
*
*    public static void main (String[] args) throws Exception
*    {
*        if (args . length != 1)
*        {
*            System.out . println ("usage: parseHtml &lt;url&gt;");
*            return;
*        }
*        {@link HtmlParser HtmlParser} hp = new {@link HtmlParser HtmlParser}
*        (
*            new {@link lava.io.UnlimitedPushbackReader UnlimitedPushbackReader}
*            (
*                new InputStreamReader
*                (
*                    new {@link lava.io.UrlInputStream UrlInputStream} (args [0])
*                )
*            )
*        );
*        {@link lava.text.ParallelColumnsWriter ParallelColumnsWriter} pcw = new {@link lava.text.ParallelColumnsWriter ParallelColumnsWriter}
*        (
*            new OutputStreamWriter (System.out),
*            new int[] {15, 60}
*        );
*        while (true)
*        {
*            {@link HtmlComponent HtmlComponent} c = hp . {@link #parse() parse} ();
*            if (c == null)
*                break;
*            String clname = c . getClass () . getName ();
*            clname = clname . substring (clname . lastIndexOf ('.') + 1);
*            pcw . {@link lava.text.ParallelColumnsWriter#writeln(java.lang.String[]) writeln}
*            (
*                new String[]
*                {
*                    clname,
*                    {@link lava.string.StringEncoder StringEncoder}.{@link lava.string.StringEncoder#encodeAsciiJavaString(java.lang.String) encodeAsciiJavaString} (c . {@link HtmlComponent#getSource() getSource} ())
*                }
*            );
*        }
*        pcw . {@link lava.text.ParallelColumnsWriter#close() close} ();
*        hp . {@link #close() close} ();
*    }
*
*
*
*}
</pre></td></tr></table>

<p>Click <a href=doc-files/parseHtml.java>here</a> to download source.</p>

<p><b>Changes:</b></p>

<dl>
	<dt>2002.04.18</dt>
		<dd>Values in name=value pairs may be enclosed in single quotes.</dd>
	<dt>2000.12.21</dt>
		<dd>Added <code>peek ()</code>.</dd>
</dl>

@see HtmlComponent
@see HtmlText
@see HtmlRegularTag
@see HtmlOpenTag
@see HtmlCloseTag
@see HtmlSpecialTag
@see HtmlComment
@see HtmlError

@version 2002.04.18
*******************************************************************************/

public class HtmlParser
{



	/** Character source for the HTML being parsed; closed by {@link #close()}. */
	private final PushbackReader in;



	/**********************************************************************
	Sets HTML source.

	<p><b>Details:</b>  This constructor sets the <code>PushbackReader</code> from which this <code>HtmlParser</code> reads.</p>

	@param in the <code>PushbackReader</code> supplying the HTML source
	**********************************************************************/

	public HtmlParser (PushbackReader in)
	{
		this . in = in;
	}



	/*
	Stack of tags that determine how the upcoming input is parsed.  It
	holds the PRE, XMP, and SCRIPT open tags that are currently in effect.
	While an XMP or SCRIPT section is being read, the open tag on top is
	replaced by the matching close tag once that tag is found; the close
	tag is then handed out by the following call to parseNext.
	*/
	private final Stack state_stack = new Stack ();



	private static final String PRE = "PRE";



	private static final String XMP = "XMP";



	private static final String SCRIPT = "SCRIPT";



	/**********************************************************************
	Returns the tag on top of the state stack.

	@return the top tag, or <code>null</code> if the stack is empty
	**********************************************************************/

	private HtmlRegularTag getState ()
	{
		if (state_stack . empty ())
			return null;
		return (HtmlRegularTag) state_stack . peek ();
	}



	/** Component obtained by {@link #peek()} but not yet handed out by {@link #parse()}. */
	private HtmlComponent nextComponent = null;



	/**********************************************************************
	Parses the next component according to the current parser state.

	<p><b>Details:</b>  With no state, or inside a PRE section, input is parsed normally by <code>parseRaw</code>.  Inside an XMP or SCRIPT section, input is collected as literal text until the matching close tag.  PRE, XMP, and SCRIPT open tags push a new state; a PRE close tag parsed inside a PRE section removes that state again, so the state stack does not grow with every PRE section in the document.</p>

	@return the parsed component, or <code>null</code> at end of input
	@exception IOException if the source stream cannot be read
	**********************************************************************/

	private HtmlComponent parseNext () throws IOException
	{
		HtmlRegularTag state = getState ();
		// A close tag on top of the stack was found by parseTextUntilTag
		// while it was collecting literal text; hand it out now.
		if (state instanceof HtmlCloseTag)
		{
			state_stack . pop ();
			return state;
		}
		String state_type = state == null ? null : state . getType ();
		if (state_type == null || state_type . equals (PRE))
		{
			HtmlComponent c = parseRaw ();
			if (c instanceof HtmlOpenTag)
			{
				String type = ((HtmlRegularTag) c) . getType ();
				if
				(	type . equals (PRE)
				||	type . equals (XMP)
				||	type . equals (SCRIPT)
				)
					state_stack . push (c);
			}
			else if (state_type != null && isCloseTag (c, PRE))
			{
				// state_type is PRE here, so this close tag ends the
				// section whose open tag is on top of the stack.
				state_stack . pop ();
			}
			return c;
		}
		if
		(
			state_type . equals (XMP)
		||	state_type . equals (SCRIPT)
		)
		{
			HtmlText text = parseTextUntilTag (state_type);
			// No literal text before the close tag (or before end of
			// input): the state has changed, so simply parse again.
			if (text == null)
				return parseNext ();
			return text;
		}
		throw new lava.UnreachableCodeException ("state_type=" + state_type);
	}



	/**********************************************************************
	Parses one HTML element.

	<p><b>Details:</b>  This method parses one element from the HTML source stream and returns it.  Use the <code>instanceof</code> operator to determine the type of element that was parsed.  <code>parse</code> returns <code>null</code> if no more elements can be parsed.</p>

	@return the parsed element
	@exception IOException if the source stream cannot be read

	@version 2000.12.21
	**********************************************************************/

	public HtmlComponent parse () throws IOException
	{
		if (nextComponent == null)
			nextComponent = parseNext ();
		HtmlComponent component = nextComponent;
		nextComponent = null;
		return component;
	}



	/**********************************************************************
	Peeks at next component without consuming.

	<p><b>Details:</b>  This method determines the next component without consuming it.  The object returned by this method is the same physical object that will be returned by <code>parse</code> the next time it is called.</p>

	@return the next component
	@exception IOException if an I/O error occurs

	@since 2000.12.21
	**********************************************************************/

	public HtmlComponent peek () throws IOException
	{
		if (nextComponent == null)
			nextComponent = parseNext ();
		return nextComponent;
	}



	/**********************************************************************
	Collects literal text up to the close tag of the given type.

	<p><b>Details:</b>  Used inside XMP and SCRIPT sections.  Characters are accumulated verbatim.  Tags other than the awaited close tag are not treated as markup; their original source is appended to the text.  When the awaited close tag is found it replaces the top of the state stack, so that <code>parseNext</code> returns it on its next call.  At end of input the state stack is emptied, which puts the parser back into its normal mode.</p>

	@param type the tag type whose close tag ends the literal text
	@return the collected text, or <code>null</code> if no characters were collected
	@exception IOException if the source stream cannot be read
	**********************************************************************/

	private HtmlText parseTextUntilTag (String type) throws IOException
	{
		StringBuffer buff = new StringBuffer ();
		while (true)
		{
			int c = IoToolbox.peek (in);
			if (c < 0)
			{
				// End of input: drop all pending state so that the next
				// parseNext call falls through to parseRaw, which
				// reports end of input by returning null.
				state_stack . clear ();
				break;
			}
			if (c == '<')
			{
				HtmlRegularTag tag = parseHtmlTag ();
				if ((tag instanceof HtmlCloseTag) && tag . getType () . equals (type))
				{
					state_stack . pop ();
					state_stack . push (tag);
					break;
				}
				if (tag != null)
				{
					buff . append (tag . getOriginalSource ());
					continue;
				}
				// Not a tag at all: parseHtmlTag has put everything back,
				// so the '<' is consumed below as an ordinary character.
			}
			in . read ();
			buff . append ((char) c);
		}
		if (buff . length () == 0)
			return null;
		String literal = buff . toString ();
		HtmlText text = new HtmlText (literal);
		text . source = literal;
		return text;
	}



	/**********************************************************************
	Parses one component in normal (non-literal) mode.

	<p><b>Details:</b>  If the input starts with '&lt;', the regular tag, comment, and special tag parsers are tried in that order.  If none of them succeeds, or the input does not start with '&lt;', the input is parsed as text.</p>

	@return the parsed component, or <code>null</code> at end of input
	@exception IOException if the source stream cannot be read
	**********************************************************************/

	private HtmlComponent parseRaw () throws IOException
	{
		int c = IoToolbox.peek (in);
		if (c < 0)
			return null;
		if (c == '<')
		{
			HtmlComponent component = parseHtmlTag ();
			if (component == null)
				component = parseHtmlComment ();
			if (component == null)
				component = parseHtmlSpecialTag ();
			if (component != null)
				return component;
		}
		return parseHtmlText ();
	}



	/**********************************************************************
	Consumes white space from the input and records it in the given buffer.

	@param ub the buffer tracking what has been consumed so far
	@exception IOException if the source stream cannot be read
	**********************************************************************/

	private void skipWhiteSpace (UnreadBuffer ub) throws IOException
	{
		ub . push (StreamParser.tryWhiteString (in));
	}



	/**********************************************************************
	Tries to parse a regular open or close tag.

	<p><b>Details:</b>  A regular tag is '&lt;', an optional '/', an identifier, and, for open tags only, a list of attributes, followed by anything up to the closing '&gt;'.  End of input is accepted in place of the closing '&gt;'.  If the input does not have this form, or another '&lt;' appears before the closing '&gt;', everything consumed is put back and <code>null</code> is returned.</p>

	@return the parsed tag, or <code>null</code> if the input is not a regular tag
	@exception IOException if the source stream cannot be read
	**********************************************************************/

	private HtmlRegularTag parseHtmlTag () throws IOException
	{
		UnreadBuffer ub = new UnreadBuffer (in);
		int c = ub . peek ();
		if (c != '<')
			return null;
		ub . read ();
		skipWhiteSpace (ub);
		boolean end_tag = false;
		if (StreamParser.tryExactString (in, "/"))
		{
			end_tag = true;
			// The '/' was consumed directly from the reader, so it is
			// recorded in the buffer by hand.
			ub . push ("/");
			skipWhiteSpace (ub);
		}
		String typestr = StreamParser.tryHtmlIdentifier (in);
		ub . push (typestr);
		if (typestr == null)
		{
			ub . unreadAll ();
			return null;
		}
		HtmlRegularTag tag;
		if (end_tag)
			tag = new HtmlCloseTag (typestr);
		else
		{
			HtmlOpenTag open_tag = new HtmlOpenTag (typestr);
			readAttributes (open_tag, ub);
			tag = open_tag;
		}
		// Skip until '>':
		while (true)
		{
			c = ub . read ();
			if (c < 0 || c == '>')
			{
				tag . source = ub . popAll ();
				return tag;
			}
			if (c == '<')
			{
				ub . unreadAll ();
				return null;
			}
		}
	}



	/**********************************************************************
	Reads the name=value pairs of an open tag and stores them in the tag.

	<p><b>Details:</b>  Reading stops at the first position where no attribute name can be read.  A name without '=' is stored with a <code>null</code> value.  A name followed by '=' for which no value can be read is not stored.</p>

	@param tag the open tag receiving the attributes
	@param ub the buffer tracking what has been consumed so far
	@exception IOException if the source stream cannot be read
	**********************************************************************/

	private void readAttributes (HtmlOpenTag tag, UnreadBuffer ub) throws IOException
	{
		while (true)
		{
			// Read name:
			String name = readName (ub);
			if (name == null)
				return;
			// Read '=':
			String value = null;
			if (readEquals (ub))
			{
				// Read value:
				value = readValue (ub);
				if (value == null)
					continue;
			}
			tag . setAttribute (name, value);
		}
	}



	/**********************************************************************
	Reads an attribute name.

	@param ub the buffer tracking what has been consumed so far
	@return the name, or <code>null</code> if the next non-white character is '&gt;', end of input, or does not start an identifier
	@exception IOException if the source stream cannot be read
	**********************************************************************/

	private String readName (UnreadBuffer ub) throws IOException
	{
		skipWhiteSpace (ub);
		int c = ub . peek ();
		if (c == '>' || c < 0)
			return null;
		String ji = StreamParser.tryHtmlIdentifier (in);
		ub . push (ji);
		return ji;
	}



	/**********************************************************************
	Consumes an '=' sign, if one is next.

	@param ub the buffer tracking what has been consumed so far
	@return <code>true</code> if an '=' was consumed, <code>false</code> if the next non-white character is something else (which is left unconsumed)
	@exception IOException if the source stream cannot be read
	**********************************************************************/

	private boolean readEquals (UnreadBuffer ub) throws IOException
	{
		skipWhiteSpace (ub);
		int c = ub . read ();
		if (c != '=')
		{
			ub . unread ();
			return false;
		}
		return true;
	}



	/**********************************************************************
	Reads an attribute value.

	<p><b>Details:</b>  A value starting with a single or double quote extends to the next occurrence of the same quote character, or to end of input if there is none; the quotes are not part of the value.  Any other value extends up to, but not including, the next white space character or '&gt;'.  The value is returned with HTML entities decoded.</p>

	@param ub the buffer tracking what has been consumed so far
	@return the decoded value, or <code>null</code> if the next non-white character is '&gt;' or end of input, or if an unquoted value is empty
	@exception IOException if the source stream cannot be read
	**********************************************************************/

	private String readValue (UnreadBuffer ub) throws IOException
	{
		StringBuffer value = new StringBuffer ();
		skipWhiteSpace (ub);
		int c = ub . peek ();
		if (c == '>' || c < 0)
			return null;
		// Quote character enclosing the value, or 0 for an unquoted value.
		char vcQuote;
		switch (c)
		{
		case '\'':
		case '"':
			vcQuote = (char) c;
			ub . read ();
			break;
		default:
			vcQuote = 0;
		}
		while (true)
		{
			c = ub . read ();
			// Check for termination:
			if (c < 0)
				break;
			if (vcQuote > 0)
			{
				// The closing quote is consumed but not stored.
				if (c == vcQuote)
					break;
			}
			else
			{
				// The terminator belongs to the tag, not the value.
				if (c == '>' || Ctype.isspace (c))
				{
					ub . unread ();
					break;
				}
			}
			value . append ((char) c);
		}
		if (value . length () == 0 && vcQuote == 0)
			return null;
		return HtmlEntities.decodeHtmlText (value . toString ());
	}



	/**********************************************************************
	Parses a run of text.

	<p><b>Details:</b>  Characters are collected up to, but not including, the next '&lt;' or end of input.  A '&lt;' in the very first position is collected like any other character; this happens when none of the tag parsers accepted it.  In that case the result is an <code>HtmlError</code> holding the raw characters.  Otherwise the result is an <code>HtmlText</code> holding the characters with HTML entities decoded.</p>

	@return the parsed text or error, or <code>null</code> if the input is already at its end
	@exception IOException if the source stream cannot be read
	**********************************************************************/

	private HtmlText parseHtmlText () throws IOException
	{
		UnreadBuffer ub = new UnreadBuffer (in);
		StringBuffer text = new StringBuffer ();
		while (true)
		{
			int c = ub . read ();
			if ((c == '<' && text . length () > 0) || c < 0)
			{
				// Leave the '<' for whoever parses the next component.
				if (c >= 0)
					ub . unread ();
				if (text . length () == 0)
					return null;
				if (text . charAt (0) == '<')
				{
					HtmlError htmlerror = new HtmlError (text . toString ());
					htmlerror . source = ub . popAll ();
					return htmlerror;
				}
				HtmlText htmltext = new HtmlText (HtmlEntities.decodeHtmlText (text . toString ()));
				htmltext . source = ub . popAll ();
				return htmltext;
			}
			text . append ((char) c);
		}
	}



	/**********************************************************************
	Tries to parse an HTML comment.

	<p><b>Details:</b>  A comment starts with "&lt;!--" and extends to the next "--&gt;", or to end of input if there is none.  The delimiters are not part of the comment text, but are part of its source.</p>

	@return the parsed comment, or <code>null</code> if the input does not start with "&lt;!--"
	@exception IOException if the source stream cannot be read
	**********************************************************************/

	private HtmlComment parseHtmlComment () throws IOException
	{
		if (! StreamParser.tryExactString (in, "<!--"))
			return null;
		UnreadBuffer ub = new UnreadBuffer (in);
		// The delimiters are consumed directly from the reader, so they
		// are recorded in the buffer by hand.
		ub . push ("<!--");
		StringBuffer comment_text = new StringBuffer ();
		while (true)
		{
			if (StreamParser.tryExactString (in, "-->"))
			{
				ub . push ("-->");
				break;
			}
			int c = ub . read ();
			if (c < 0)
				break;
			comment_text . append ((char) c);
		}
		HtmlComment htmlcomment = new HtmlComment (comment_text . toString ());
		htmlcomment . source = ub . popAll ();
		return htmlcomment;
	}



	/**********************************************************************
	Tries to parse a special tag.

	<p><b>Details:</b>  parseHtmlSpecialTag parses strange forms of HTML tags which do not match the form &lt; text name = value &gt;, &lt; / text &gt;, or &lt;!-- comment --&gt;.  The tag extends from '&lt;' to the next '&gt;', or to end of input if there is none.  If another '&lt;' appears first, everything consumed is put back and <code>null</code> is returned.</p>

	@return the parsed tag, or <code>null</code> if the input is not a special tag
	@exception IOException if an I/O error occurs
	**********************************************************************/

	private HtmlSpecialTag parseHtmlSpecialTag () throws IOException
	{
		UnreadBuffer ub = new UnreadBuffer (in);
		if (ub . peek () != '<')
			return null;
		ub . read ();
		StringBuffer content = new StringBuffer ();
		while (true)
		{
			int c = ub . read ();
			if (c == '<')
			{
				ub . unreadAll ();
				return null;
			}
			if (c == '>' || c < 0)
				break;
			content . append ((char) c);
		}
		HtmlSpecialTag htmlspecialtag = new HtmlSpecialTag (content . toString ());
		htmlspecialtag . source = ub . popAll ();
		return htmlspecialtag;
	}



	/**********************************************************************
	Closes source input stream.

	<p><b>Details:</b>  This method closes the HTML source input stream.  Of course, no more HTML tokens can be parsed after this method is called.</p>

	@exception IOException if an I/O error occurs
	**********************************************************************/

	public void close () throws IOException
	{
		in . close ();
	}



	/**********************************************************************
	Tests for an open tag of a given type.

	<p><b>Details:</b>  The type is compared with <code>equals</code> against the value returned by the tag's <code>getType</code> method.</p>

	@param c the component to test (may be <code>null</code>)
	@param type the tag type to look for
	@return <code>true</code> if <code>c</code> is an <code>HtmlOpenTag</code> of the given type
	**********************************************************************/

	public static boolean isOpenTag (HtmlComponent c, String type)
	{
		return (c instanceof HtmlOpenTag) && ((HtmlOpenTag) c) . getType () . equals (type);
	}



	/**********************************************************************
	Tests for a close tag of a given type.

	<p><b>Details:</b>  The type is compared with <code>equals</code> against the value returned by the tag's <code>getType</code> method.</p>

	@param c the component to test (may be <code>null</code>)
	@param type the tag type to look for
	@return <code>true</code> if <code>c</code> is an <code>HtmlCloseTag</code> of the given type

	@since 2000.12.22
	**********************************************************************/

	public static boolean isCloseTag (HtmlComponent c, String type)
	{
		return (c instanceof HtmlCloseTag) && ((HtmlCloseTag) c) . getType () . equals (type);
	}



}



