pt.tumba.parser
Class HTMLParser

java.lang.Object
  extended by pt.tumba.parser.HTMLParser

public class HTMLParser
extends java.lang.Object

Parser to extract metadata from HTML files

Author:
bmartins

Field Summary
private  java.lang.StringBuffer anchorText
           
private  int anotation
           
private  java.lang.String base
          The base address used to resolve links
private  Content content
           
private  java.lang.String[][] escape
           
private  java.util.Map escapeMap
           
private  boolean followRedirects
           
private  HyperLinks hyperlinks
           
private  ImageLinks images
           
private  java.io.InputStream input
          An input stream for the document
private  java.lang.String language
           
private  boolean languageComputation
           
private  pt.tumba.ngram.LanguageClass languageModels
           
private  java.util.List lastFormHidden
           
private  java.util.List lastFormSelect
           
private  java.lang.String lastFormURL
           
private  java.lang.String lastSelectName
           
private  java.lang.String lastSubmit
           
private  java.net.URL link
           
private  HTMLMarkup markup
           
private  int maxTerms
           
private  MetaData metadata
           
private  java.lang.String modelsPath
           
private  java.lang.String name
          The url or the filename of the document being parsed
private  int nextChar
          Internal storage for the current character read from the document
private  java.lang.Object nextToken
          Internal storage for the tokens parsed from the document
private  java.lang.StringBuffer output
          Temporary internal storage for the document text
private  int position
           
private  boolean printLine
          Flag to indicate the last character on the output is a linefeed
private  boolean printSpace
          Flag to indicate the last character on the output is a space
private  pt.tumba.ngram.EntryProfile profile
           
private  int redirects
          Number of followed redirects (only 2 allowed)
private  boolean tagBreak
          Flag to indicate that after the last token there was a tag break
private  int tagsAnchor
           
private  int tagsBig
           
private  int tagsBold
           
private  int tagsEmphasize
           
private  int tagsHeading1
           
private  int tagsHeading2
           
private  int tagsHeading3
           
private  int tagsHeading4
           
private  int tagsHeading5
           
private  int tagsHeading6
           
private  int tagsItalic
           
private  int tagsSmall
           
private  int tagsStrong
           
private  int tagsTitle
           
private  boolean useEncoding
           
private  int wgramcount
           
private  boolean withinIgnore
          Flag to indicate the current token is within HTML comments or javascript code
 
Constructor Summary
HTMLParser()
          Constructor for the HTMLParser object
HTMLParser(java.lang.String modelsPath)
          Constructor for the HTMLParser object
 
Method Summary
private  boolean advanceScanner()
          Description of the Method
private  java.lang.String analyseTagAux(java.lang.String tag, java.lang.String key)
          Description of the Method
private  boolean checkPath(java.lang.String type)
          Description of the Method
private  void computeLanguageProfile()
           
 java.lang.String getBase()
          Description of the Method
 Content getContent()
           
 HTMLMarkup getHTMLMarkup()
           
 HyperLinks getHyperLinks()
           
 ImageLinks getImages()
           
 java.lang.String getLanguage()
          Description of the Method
 java.util.Iterator getLinks(java.lang.String word)
          Gets the sentence positions attribute of the HTMLParser object
 MetaData getMetaData()
           
 java.lang.String getName()
          Description of the Method
private  java.lang.String getNextToken()
          Description of the Method
 double getNGramRank(java.lang.String ngram)
          Gets the nGramRank attribute of the HTMLParser object
 java.util.Iterator getNGrams()
          Description of the Method
 java.util.Iterator getStems()
           
 java.util.Iterator getURLs(java.lang.String word)
          Description of the Method
private  java.lang.String guessLanguage()
          Description of the Method
 void initTokenizer(java.io.File input)
          Description of the Method
 void initTokenizer(java.io.File input, java.lang.String encoding)
          Description of the Method
 void initTokenizer(java.io.File input, java.lang.String encoding, java.net.URL base)
          Description of the Method
 void initTokenizer(java.io.File input, java.net.URL base)
          Description of the Method
 void initTokenizer(java.io.InputStream input)
          Description of the Method
private  void initTokenizer(java.io.InputStream input, boolean encoding)
          Description of the Method
 void initTokenizer(java.io.InputStream input, java.lang.String encoding)
          Description of the Method
 void initTokenizer(java.io.InputStream input, java.lang.String encoding, java.net.URL base)
          Description of the Method
 void initTokenizer(java.io.InputStream input, java.net.URL base)
          Description of the Method
 void initTokenizer(java.net.URL input)
          Description of the Method
 void initTokenizer(java.net.URL input, int redirects)
           
 void initTokenizer(java.net.URL input, java.lang.String encoding)
          Description of the Method
 void initTokenizer(java.net.URL input, java.lang.String encoding, int redirects)
          Description of the Method
 void loadLanguageModels(java.lang.String modelsPath)
          Description of the Method
 void processData()
          Description of the Method
private  boolean processMetaTags(java.lang.String name2, java.lang.String value2)
          Description of the Method
private  void processOthers()
          Description of the Method
private  void processSpaces(boolean line)
          Description of the Method
private  void processSpecialChar()
          Description of the Method
private  boolean processTag()
          Description of the Method
private  int removeSpecialChars(java.lang.String s)
          Description of the Method
private  java.lang.String removeSpecialChars2(java.lang.String s)
          Description of the Method
 void setMaxTerms(int max)
           
private  void updateAnnotationCount(java.lang.String term, int annotation)
          Updates the internal count regarding the HTML markup information for specific terms
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

profile

private pt.tumba.ngram.EntryProfile profile

languageModels

private pt.tumba.ngram.LanguageClass languageModels

anchorText

private java.lang.StringBuffer anchorText

tagsAnchor

private int tagsAnchor

tagsBig

private int tagsBig

tagsBold

private int tagsBold

tagsEmphasize

private int tagsEmphasize

tagsHeading1

private int tagsHeading1

tagsHeading2

private int tagsHeading2

tagsHeading3

private int tagsHeading3

tagsHeading4

private int tagsHeading4

tagsHeading5

private int tagsHeading5

tagsHeading6

private int tagsHeading6

tagsItalic

private int tagsItalic

tagsSmall

private int tagsSmall

tagsStrong

private int tagsStrong

tagsTitle

private int tagsTitle

anotation

private int anotation

base

private java.lang.String base
The base address used to resolve links


escape

private java.lang.String[][] escape

escapeMap

private java.util.Map escapeMap

followRedirects

private boolean followRedirects

useEncoding

private boolean useEncoding

input

private java.io.InputStream input
An input stream for the document


language

private java.lang.String language

languageComputation

private boolean languageComputation

lastFormHidden

private java.util.List lastFormHidden

lastFormSelect

private java.util.List lastFormSelect

lastFormURL

private java.lang.String lastFormURL

lastSelectName

private java.lang.String lastSelectName

lastSubmit

private java.lang.String lastSubmit

link

private java.net.URL link

maxTerms

private int maxTerms

modelsPath

private java.lang.String modelsPath

name

private java.lang.String name
The url or the filename of the document being parsed


nextChar

private int nextChar
Internal storage for the current character read from the document


nextToken

private java.lang.Object nextToken
Internal storage for the tokens parsed from the document


output

private java.lang.StringBuffer output
Temporary internal storage for the document text


position

private int position

printLine

private boolean printLine
Flag to indicate the last character on the output is a linefeed


printSpace

private boolean printSpace
Flag to indicate the last character on the output is a space


redirects

private int redirects
Number of followed redirects (only 2 allowed)


tagBreak

private boolean tagBreak
Flag to indicate that after the last token there was a tag break


wgramcount

private int wgramcount

withinIgnore

private boolean withinIgnore
Flag to indicate the current token is within HTML comments or javascript code


metadata

private MetaData metadata

hyperlinks

private HyperLinks hyperlinks

images

private ImageLinks images

markup

private HTMLMarkup markup

content

private Content content
Constructor Detail

HTMLParser

public HTMLParser()
Constructor for the HTMLParser object


HTMLParser

public HTMLParser(java.lang.String modelsPath)
Constructor for the HTMLParser object

Parameters:
modelsPath - Description of the Parameter
Method Detail

getNGrams

public java.util.Iterator getNGrams()
Description of the Method

Returns:
Description of the Return Value

getNGramRank

public double getNGramRank(java.lang.String ngram)
Gets the nGramRank attribute of the HTMLParser object

Parameters:
ngram - Description of the Parameter
Returns:
The nGramRank value

guessLanguage

private java.lang.String guessLanguage()
Description of the Method

Returns:
Description of the Return Value

computeLanguageProfile

private void computeLanguageProfile()

advanceScanner

private boolean advanceScanner()
                        throws java.io.IOException
Description of the Method

Returns:
Description of the Return Value
Throws:
java.io.IOException - Description of the Exception

analyseTagAux

private java.lang.String analyseTagAux(java.lang.String tag,
                                       java.lang.String key)
                                throws java.lang.StringIndexOutOfBoundsException
Description of the Method

Parameters:
tag - Description of the Parameter
key - Description of the Parameter
Returns:
Description of the Return Value
Throws:
java.lang.StringIndexOutOfBoundsException - Description of the Exception

getBase

public java.lang.String getBase()
Description of the Method

Returns:
Description of the Return Value

checkPath

private boolean checkPath(java.lang.String type)
Description of the Method

Parameters:
type - Description of the Parameter
Returns:
Description of the Return Value

getLinks

public java.util.Iterator getLinks(java.lang.String word)
Gets the sentence positions attribute of the HTMLParser object

Parameters:
word - Description of the Parameter
Returns:
The positions value

getURLs

public java.util.Iterator getURLs(java.lang.String word)
Description of the Method

Parameters:
word - Description of the Parameter
Returns:
Description of the Return Value

initTokenizer

public void initTokenizer(java.io.File input)
                   throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

initTokenizer

public void initTokenizer(java.io.File input,
                          java.lang.String encoding)
                   throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

initTokenizer

public void initTokenizer(java.io.File input,
                          java.net.URL base)
                   throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
base - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

initTokenizer

public void initTokenizer(java.io.File input,
                          java.lang.String encoding,
                          java.net.URL base)
                   throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
base - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

initTokenizer

public void initTokenizer(java.io.InputStream input)
                   throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

initTokenizer

public void initTokenizer(java.io.InputStream input,
                          java.lang.String encoding)
                   throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

initTokenizer

private void initTokenizer(java.io.InputStream input,
                           boolean encoding)
                    throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

initTokenizer

public void initTokenizer(java.io.InputStream input,
                          java.net.URL base)
                   throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
base - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

initTokenizer

public void initTokenizer(java.io.InputStream input,
                          java.lang.String encoding,
                          java.net.URL base)
                   throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
base - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

initTokenizer

public void initTokenizer(java.net.URL input)
                   throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

initTokenizer

public void initTokenizer(java.net.URL input,
                          java.lang.String encoding)
                   throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

initTokenizer

public void initTokenizer(java.net.URL input,
                          int redirects)
                   throws java.lang.Exception
Throws:
java.lang.Exception

initTokenizer

public void initTokenizer(java.net.URL input,
                          java.lang.String encoding,
                          int redirects)
                   throws java.lang.Exception
Description of the Method

Parameters:
input - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

getLanguage

public java.lang.String getLanguage()
Description of the Method

Returns:
Description of the Return Value

loadLanguageModels

public void loadLanguageModels(java.lang.String modelsPath)
                        throws java.lang.Exception
Description of the Method

Parameters:
modelsPath - Description of the Parameter
Throws:
java.lang.Exception - Description of the Exception

getName

public java.lang.String getName()
Description of the Method

Returns:
Description of the Return Value

getNextToken

private java.lang.String getNextToken()
                               throws java.io.IOException
Description of the Method

Returns:
Description of the Return Value
Throws:
java.io.IOException - Description of the Exception

processData

public void processData()
Description of the Method


processMetaTags

private boolean processMetaTags(java.lang.String name2,
                                java.lang.String value2)
Description of the Method

Parameters:
name2 - Description of the Parameter
value2 - Description of the Parameter
Returns:
Description of the Return Value

getMetaData

public MetaData getMetaData()

getHTMLMarkup

public HTMLMarkup getHTMLMarkup()

getHyperLinks

public HyperLinks getHyperLinks()

getImages

public ImageLinks getImages()

getContent

public Content getContent()

processOthers

private void processOthers()
Description of the Method


processSpaces

private void processSpaces(boolean line)
Description of the Method

Parameters:
line - Description of the Parameter

processSpecialChar

private void processSpecialChar()
                         throws java.io.IOException
Description of the Method

Throws:
java.io.IOException - Description of the Exception

processTag

private boolean processTag()
                    throws java.io.IOException
Description of the Method

Returns:
Description of the Return Value
Throws:
java.io.IOException - Description of the Exception

removeSpecialChars

private int removeSpecialChars(java.lang.String s)
Description of the Method

Parameters:
s - Description of the Parameter
Returns:
Description of the Return Value

removeSpecialChars2

private java.lang.String removeSpecialChars2(java.lang.String s)
Description of the Method

Parameters:
s - Description of the Parameter
Returns:
Description of the Return Value

setMaxTerms

public void setMaxTerms(int max)

updateAnnotationCount

private void updateAnnotationCount(java.lang.String term,
                                   int annotation)
Updates the internal count regarding the HTML markup information for specific terms

Parameters:
term - The term for which the markup information is to be updated
annotation - A bit flag with the markup information assigned to the term

getStems

public java.util.Iterator getStems()