|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
  pt.tumba.parser.HTMLParser
public class HTMLParser
Parser to extract metadata from HTML files
Field Summary | |
---|---|
private java.lang.StringBuffer |
anchorText
|
private int |
anotation
|
private java.lang.String |
base
The base address used to resolve links |
private Content |
content
|
private java.lang.String[][] |
escape
|
private java.util.Map |
escapeMap
|
private boolean |
followRedirects
|
private HyperLinks |
hyperlinks
|
private ImageLinks |
images
|
private java.io.InputStream |
input
An input stream for the document |
private java.lang.String |
language
|
private boolean |
languageComputation
|
private pt.tumba.ngram.LanguageClass |
languageModels
|
private java.util.List |
lastFormHidden
|
private java.util.List |
lastFormSelect
|
private java.lang.String |
lastFormURL
|
private java.lang.String |
lastSelectName
|
private java.lang.String |
lastSubmit
|
private java.net.URL |
link
|
private HTMLMarkup |
markup
|
private int |
maxTerms
|
private MetaData |
metadata
|
private java.lang.String |
modelsPath
|
private java.lang.String |
name
The url or the filename of the document being parsed |
private int |
nextChar
Internal storage for the current character read from the document |
private java.lang.Object |
nextToken
Internal storage for the tokens parsed from the document |
private java.lang.StringBuffer |
output
Temporary internal storage for the document text |
private int |
position
|
private boolean |
printLine
Flag to indicate the last character on the output is a linefeed |
private boolean |
printSpace
Flag to indicate the last character on the output is a space |
private pt.tumba.ngram.EntryProfile |
profile
|
private int |
redirects
Number of followed redirects (only 2 allowed) |
private boolean |
tagBreak
Flag to indicate that after the last token there was a tag break |
private int |
tagsAnchor
|
private int |
tagsBig
|
private int |
tagsBold
|
private int |
tagsEmphasize
|
private int |
tagsHeading1
|
private int |
tagsHeading2
|
private int |
tagsHeading3
|
private int |
tagsHeading4
|
private int |
tagsHeading5
|
private int |
tagsHeading6
|
private int |
tagsItalic
|
private int |
tagsSmall
|
private int |
tagsStrong
|
private int |
tagsTitle
|
private boolean |
useEncoding
|
private int |
wgramcount
|
private boolean |
withinIgnore
Flag to indicate the current token is within HTML comments or javascript code |
Constructor Summary | |
---|---|
HTMLParser()
Constructor for the HTMLParser object |
|
HTMLParser(java.lang.String modelsPath)
Constructor for the HTMLParser object |
Method Summary | |
---|---|
private boolean |
advanceScanner()
Description of the Method |
private java.lang.String |
analyseTagAux(java.lang.String tag,
java.lang.String key)
Description of the Method |
private boolean |
checkPath(java.lang.String type)
Description of the Method |
private void |
computeLanguageProfile()
|
java.lang.String |
getBase()
Description of the Method |
Content |
getContent()
|
HTMLMarkup |
getHTMLMarkup()
|
HyperLinks |
getHyperLinks()
|
ImageLinks |
getImages()
|
java.lang.String |
getLanguage()
Description of the Method |
java.util.Iterator |
getLinks(java.lang.String word)
Gets the sentence positions attribute of the HTMLParser object |
MetaData |
getMetaData()
|
java.lang.String |
getName()
Description of the Method |
private java.lang.String |
getNextToken()
Description of the Method |
double |
getNGramRank(java.lang.String ngram)
Gets the nGramRank attribute of the HTMLParser object |
java.util.Iterator |
getNGrams()
Description of the Method |
java.util.Iterator |
getStems()
|
java.util.Iterator |
getURLs(java.lang.String word)
Description of the Method |
private java.lang.String |
guessLanguage()
Description of the Method |
void |
initTokenizer(java.io.File input)
Description of the Method |
void |
initTokenizer(java.io.File input,
java.lang.String encoding)
Description of the Method |
void |
initTokenizer(java.io.File input,
java.lang.String encoding,
java.net.URL base)
Description of the Method |
void |
initTokenizer(java.io.File input,
java.net.URL base)
Description of the Method |
void |
initTokenizer(java.io.InputStream input)
Description of the Method |
private void |
initTokenizer(java.io.InputStream input,
boolean encoding)
Description of the Method |
void |
initTokenizer(java.io.InputStream input,
java.lang.String encoding)
Description of the Method |
void |
initTokenizer(java.io.InputStream input,
java.lang.String encoding,
java.net.URL base)
Description of the Method |
void |
initTokenizer(java.io.InputStream input,
java.net.URL base)
Description of the Method |
void |
initTokenizer(java.net.URL input)
Description of the Method |
void |
initTokenizer(java.net.URL input,
int redirects)
|
void |
initTokenizer(java.net.URL input,
java.lang.String encoding)
Description of the Method |
void |
initTokenizer(java.net.URL input,
java.lang.String encoding,
int redirects)
Description of the Method |
void |
loadLanguageModels(java.lang.String modelsPath)
Description of the Method |
void |
processData()
Description of the Method |
private boolean |
processMetaTags(java.lang.String name2,
java.lang.String value2)
Description of the Method |
private void |
processOthers()
Description of the Method |
private void |
processSpaces(boolean line)
Description of the Method |
private void |
processSpecialChar()
Description of the Method |
private boolean |
processTag()
Description of the Method |
private int |
removeSpecialChars(java.lang.String s)
Description of the Method |
private java.lang.String |
removeSpecialChars2(java.lang.String s)
Description of the Method |
void |
setMaxTerms(int max)
|
private void |
updateAnnotationCount(java.lang.String term,
int annotation)
Updates the internal count regarding the HTML markup information for specific terms |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
private pt.tumba.ngram.EntryProfile profile
private pt.tumba.ngram.LanguageClass languageModels
private java.lang.StringBuffer anchorText
private int tagsAnchor
private int tagsBig
private int tagsBold
private int tagsEmphasize
private int tagsHeading1
private int tagsHeading2
private int tagsHeading3
private int tagsHeading4
private int tagsHeading5
private int tagsHeading6
private int tagsItalic
private int tagsSmall
private int tagsStrong
private int tagsTitle
private int anotation
private java.lang.String base
private java.lang.String[][] escape
private java.util.Map escapeMap
private boolean followRedirects
private boolean useEncoding
private java.io.InputStream input
private java.lang.String language
private boolean languageComputation
private java.util.List lastFormHidden
private java.util.List lastFormSelect
private java.lang.String lastFormURL
private java.lang.String lastSelectName
private java.lang.String lastSubmit
private java.net.URL link
private int maxTerms
private java.lang.String modelsPath
private java.lang.String name
private int nextChar
private java.lang.Object nextToken
private java.lang.StringBuffer output
private int position
private boolean printLine
private boolean printSpace
private int redirects
private boolean tagBreak
private int wgramcount
private boolean withinIgnore
private MetaData metadata
private HyperLinks hyperlinks
private ImageLinks images
private HTMLMarkup markup
private Content content
Constructor Detail |
---|
public HTMLParser()
public HTMLParser(java.lang.String modelsPath)
modelsPath
- Description of the Parameter
Method Detail |
---|
public java.util.Iterator getNGrams()
public double getNGramRank(java.lang.String ngram)
ngram
- Description of the Parameter
private java.lang.String guessLanguage()
private void computeLanguageProfile()
private boolean advanceScanner() throws java.io.IOException
java.io.IOException
- Description of the Exception
private java.lang.String analyseTagAux(java.lang.String tag, java.lang.String key) throws java.lang.StringIndexOutOfBoundsException
tag
- Description of the Parameter
key
- Description of the Parameter
java.lang.StringIndexOutOfBoundsException
- Description of the Exception
public java.lang.String getBase()
private boolean checkPath(java.lang.String type)
type
- Description of the Parameter
public java.util.Iterator getLinks(java.lang.String word)
word
- Description of the Parameter
public java.util.Iterator getURLs(java.lang.String word)
word
- Description of the Parameter
public void initTokenizer(java.io.File input) throws java.lang.Exception
input
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public void initTokenizer(java.io.File input, java.lang.String encoding) throws java.lang.Exception
input
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public void initTokenizer(java.io.File input, java.net.URL base) throws java.lang.Exception
input
- Description of the Parameter
base
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public void initTokenizer(java.io.File input, java.lang.String encoding, java.net.URL base) throws java.lang.Exception
input
- Description of the Parameter
base
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public void initTokenizer(java.io.InputStream input) throws java.lang.Exception
input
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public void initTokenizer(java.io.InputStream input, java.lang.String encoding) throws java.lang.Exception
input
- Description of the Parameter
java.lang.Exception
- Description of the Exception
private void initTokenizer(java.io.InputStream input, boolean encoding) throws java.lang.Exception
input
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public void initTokenizer(java.io.InputStream input, java.net.URL base) throws java.lang.Exception
input
- Description of the Parameter
base
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public void initTokenizer(java.io.InputStream input, java.lang.String encoding, java.net.URL base) throws java.lang.Exception
input
- Description of the Parameter
base
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public void initTokenizer(java.net.URL input) throws java.lang.Exception
input
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public void initTokenizer(java.net.URL input, java.lang.String encoding) throws java.lang.Exception
input
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public void initTokenizer(java.net.URL input, int redirects) throws java.lang.Exception
java.lang.Exception
public void initTokenizer(java.net.URL input, java.lang.String encoding, int redirects) throws java.lang.Exception
input
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public java.lang.String getLanguage()
public void loadLanguageModels(java.lang.String modelsPath) throws java.lang.Exception
modelsPath
- Description of the Parameter
java.lang.Exception
- Description of the Exception
public java.lang.String getName()
private java.lang.String getNextToken() throws java.io.IOException
java.io.IOException
- Description of the Exception
public void processData()
private boolean processMetaTags(java.lang.String name2, java.lang.String value2)
name
- Description of the Parameter
value
- Description of the Parameter
public MetaData getMetaData()
public HTMLMarkup getHTMLMarkup()
public HyperLinks getHyperLinks()
public ImageLinks getImages()
public Content getContent()
private void processOthers()
private void processSpaces(boolean line)
line
- Description of the Parameter
private void processSpecialChar() throws java.io.IOException
java.io.IOException
- Description of the Exception
private boolean processTag() throws java.io.IOException
java.io.IOException
- Description of the Exception
private int removeSpecialChars(java.lang.String s)
s
- Description of the Parameter
private java.lang.String removeSpecialChars2(java.lang.String s)
s
- Description of the Parameter
public void setMaxTerms(int max)
private void updateAnnotationCount(java.lang.String term, int annotation)
term
- The term for which the markup information is to be updated
annotation
- A bit flag with the markup information assigned to the term
public java.util.Iterator getStems()
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |