|
| HTMLParser () |
|
void | handleText (final char[] data, final int pos) |
|
void | handleStartTag (final HTML.Tag tag, final MutableAttributeSet attrSet, final int pos) |
|
void | handleEndTag (final HTML.Tag tag, final int pos) |
|
String | parse (final URL url) throws IOException |
|
String | parse (final Reader r) throws IOException |
|
String | parse (final String text0) throws IOException |
|
String | title () |
|
|
static String | searchAndReplace (final String text, final String from0, final String to) |
|
static String | escapeString (final String s, final char[] charsToEscape, final char escapeChar) |
|
static String | slurpReader (final Reader reader) throws IOException |
|
static String | slurpURL (final URL u) throws IOException |
|
static void | main (final String[] args) throws IOException |
|
Parses an HTML document and returns the plain text (and title). HTMLParser is mainly used through the parse(URL url)
method, which returns the contents of an HTML page as a String, without the tags. After calling parse, you can get the HTML title (contents of the TITLE tag) by calling title(). Subclasses may override the handleText(), handleComment(), handleStartTag(), etc. methods so that parse(URL url)
returns something other than the text of the web page. (For example, one may be interested in returning only part of the text, or only the links.)
- Author
- Sepandar Kamvar (sdkam.nosp@m.var@.nosp@m.stanf.nosp@m.ord..nosp@m.edu)
◆ HTMLParser()
grammarscope.utils.HTMLParser.HTMLParser |
( |
| ) |
|
◆ escapeString()
static String grammarscope.utils.HTMLParser.escapeString |
( |
final String |
s, |
|
|
final char[] |
charsToEscape, |
|
|
final char |
escapeChar |
|
) |
| |
|
static |
◆ handleEndTag()
void grammarscope.utils.HTMLParser.handleEndTag |
( |
final HTML.Tag |
tag, |
|
|
final int |
pos |
|
) |
| |
Sets a flag if the end tag is the "TITLE" element end tag.
◆ handleStartTag()
void grammarscope.utils.HTMLParser.handleStartTag |
( |
final HTML.Tag |
tag, |
|
|
final MutableAttributeSet |
attrSet, |
|
|
final int |
pos |
|
) |
| |
Sets a flag if the start tag is the "TITLE" element start tag.
◆ handleText()
void grammarscope.utils.HTMLParser.handleText |
( |
final char[] |
data, |
|
|
final int |
pos |
|
) |
| |
◆ main()
static void grammarscope.utils.HTMLParser.main |
( |
final String[] |
args | ) |
throws IOException |
|
static |
◆ parse() [1/3]
String grammarscope.utils.HTMLParser.parse |
( |
final Reader |
r | ) |
throws IOException |
◆ parse() [2/3]
String grammarscope.utils.HTMLParser.parse |
( |
final String |
text0 | ) |
throws IOException |
The parse method that actually does the work. Now it first gets rid of singleton tags before running.
- Parameters
- text0: the HTML text to parse
- Returns
- parsed string
- Exceptions
- IOException
◆ parse() [3/3]
String grammarscope.utils.HTMLParser.parse |
( |
final URL |
url | ) |
throws IOException |
◆ searchAndReplace()
static String grammarscope.utils.HTMLParser.searchAndReplace |
( |
final String |
text, |
|
|
final String |
from0, |
|
|
final String |
to |
|
) |
| |
|
static |
◆ slurpReader()
static String grammarscope.utils.HTMLParser.slurpReader |
( |
final Reader |
reader | ) |
throws IOException |
|
static |
Returns all the text from the given Reader. Closes the Reader when done.
- Parameters
- reader: the Reader to read the text from (closed when done)
- Returns
- The text in the file.
- Exceptions
- IOException
◆ slurpURL()
static String grammarscope.utils.HTMLParser.slurpURL |
( |
final URL |
u | ) |
throws IOException |
|
static |
Returns all the text at the given URL.
- Parameters
- u: the URL to read the text from
- Returns
- all the text at the given URL
- Exceptions
- IOException
◆ title()
String grammarscope.utils.HTMLParser.title |
( |
| ) |
|
◆ isBody
boolean grammarscope.utils.HTMLParser.isBody |
|
protected |
◆ isScript
boolean grammarscope.utils.HTMLParser.isScript |
|
protected |
◆ isTitle
boolean grammarscope.utils.HTMLParser.isTitle |
|
protected |
◆ SLURP_BUFFER_SIZE
final int grammarscope.utils.HTMLParser.SLURP_BUFFER_SIZE = 16000 |
|
static private |
◆ textBuffer
StringBuffer grammarscope.utils.HTMLParser.textBuffer |
|
protected |
◆ title
String grammarscope.utils.HTMLParser.title |
|
protected |
The documentation for this class was generated from the following file: