pt.tumba.parser
Class Content

java.lang.Object
  extended by pt.tumba.parser.Content

public class Content
extends java.lang.Object

Author:
Bruno Martins

Field Summary
protected  java.util.Map annotationCount
           
protected  java.util.Map annotations
           
protected  java.lang.StringBuffer content
           
private  RabinHashFunction hashFunction
          The Rabin hash function used to produce a hash code for the content of the document
private  MetaData metadata
           
protected  int numTokens
           
protected  int numTokensWithStopWords
           
protected  java.util.List terms
           
protected  java.util.Map termsLinks
           
protected  java.util.List textBlocks
           
protected  java.util.Map wordGrams
           
 
Constructor Summary
protected Content(MetaData metadata)
           
 
Method Summary
protected  int getCountAux(java.lang.String term, int i)
           
 java.lang.String getFilteredText()
          Returns the text extracted from the document, with garbage sentences removed
 int getFrequency(java.lang.String word)
          Gets the frequency attribute of the HTMLParser object
 java.util.Iterator getFrequentTerms(int minfreq)
          Gets the frequentTerms attribute of the HTMLParser object
 long getHashCode()
          Description of the Method
 double getMean(java.lang.String word)
          Gets the mean value for the given word
 int getNumTokens()
          Description of the Method
 java.lang.String getOriginalContent()
          Description of the Method
 int[] getPositions(java.lang.String word)
          Gets the positions attribute of the HTMLParser object
 int[] getPositions(java.lang.String word, java.lang.String url)
          Gets the positions attribute of the HTMLParser object
 int[] getSentences(java.lang.String word)
          Gets the sentence positions attribute of the HTMLParser object
 int getTermInfo(java.lang.String word)
          Gets the termInfo attribute of the HTMLParser object
 java.util.Iterator getTerms()
          Returns the terms extracted from the document.
 java.lang.String getText()
          Returns the text extracted from the document
 java.lang.String[] getTextBlocks()
          Returns the sentences extracted from the document
 java.lang.String getToken(int pos)
          Returns the token at a given position in the document
 java.util.Iterator getTokens()
          Returns the tokens extracted from the document
 double getTScore(java.lang.String word)
          Gets the t-score value for the given word
 double getVariance(java.lang.String word)
          Gets the variance value for the given word
 int getWordGramFrequency(java.lang.String wgram)
          Gets the frequency of the given word-gram
 java.util.Iterator getWordGrams()
          Returns the sequences of 1 to 5 consecutive words extracted from the document
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

terms

protected java.util.List terms

termsLinks

protected java.util.Map termsLinks

textBlocks

protected java.util.List textBlocks

wordGrams

protected java.util.Map wordGrams

annotationCount

protected java.util.Map annotationCount

annotations

protected java.util.Map annotations

content

protected java.lang.StringBuffer content

numTokens

protected int numTokens

numTokensWithStopWords

protected int numTokensWithStopWords

hashFunction

private RabinHashFunction hashFunction
The Rabin hash function used to produce a hash code for the content of the document


metadata

private MetaData metadata
Constructor Detail

Content

protected Content(MetaData metadata)
Method Detail

getTerms

public java.util.Iterator getTerms()
Returns the terms extracted from the document. Unlike the getTokens method, this method filters out the stop-words.

Returns:
An iterator over the terms extracted from the document

getText

public java.lang.String getText()
Returns the text extracted from the document

Returns:
A String with the text extracted from the document

getFilteredText

public java.lang.String getFilteredText()
Returns the text extracted from the document, with garbage sentences removed

Returns:
A String with the text extracted from the document

getTextBlocks

public java.lang.String[] getTextBlocks()
Returns the sentences extracted from the document

Returns:
An array of Strings with the sentences extracted from the document

getToken

public java.lang.String getToken(int pos)
Returns the token at a given position in the document

Parameters:
pos - The position within the document
Returns:
The token at the given position in the document

getTokens

public java.util.Iterator getTokens()
Returns the tokens extracted from the document

Returns:
An iterator over the tokens extracted from the document

getWordGrams

public java.util.Iterator getWordGrams()
Returns the sequences of 1 to 5 consecutive words extracted from the document

Returns:
An iterator over the word-grams of length 1 to 5 extracted from the document

getNumTokens

public int getNumTokens()
Description of the Method

Returns:
Description of the Return Value

getHashCode

public long getHashCode()
Description of the Method

Returns:
Description of the Return Value

getFrequency

public int getFrequency(java.lang.String word)
Gets the frequency attribute of the HTMLParser object

Parameters:
word - Description of the Parameter
Returns:
The frequency value

getFrequentTerms

public java.util.Iterator getFrequentTerms(int minfreq)
Gets the frequentTerms attribute of the HTMLParser object

Parameters:
minfreq - Description of the Parameter
Returns:
The frequentTerms value

getCountAux

protected int getCountAux(java.lang.String term,
                          int i)

getMean

public double getMean(java.lang.String word)
Gets the mean value for the given word

Parameters:
word - Description of the Parameter
Returns:
The mean value

getPositions

public int[] getPositions(java.lang.String word)
Gets the positions attribute of the HTMLParser object

Parameters:
word - Description of the Parameter
Returns:
The positions value

getPositions

public int[] getPositions(java.lang.String word,
                          java.lang.String url)
Gets the positions attribute of the HTMLParser object

Parameters:
word - Description of the Parameter
url - Description of the Parameter
Returns:
The positions value

getSentences

public int[] getSentences(java.lang.String word)
Gets the sentence positions attribute of the HTMLParser object

Parameters:
word - Description of the Parameter
Returns:
The positions value

getTermInfo

public int getTermInfo(java.lang.String word)
Gets the termInfo attribute of the HTMLParser object

Parameters:
word - Description of the Parameter
Returns:
The termInfo value

getTScore

public double getTScore(java.lang.String word)
Gets the t-score value for the given word

Parameters:
word - Description of the Parameter
Returns:
The t-score value

getVariance

public double getVariance(java.lang.String word)
Gets the variance value for the given word

Parameters:
word - Description of the Parameter
Returns:
The variance value

getWordGramFrequency

public int getWordGramFrequency(java.lang.String wgram)
Gets the frequency of the given word-gram

Parameters:
wgram - Description of the Parameter
Returns:
The word-gram frequency value

getOriginalContent

public java.lang.String getOriginalContent()
Description of the Method

Returns:
Description of the Return Value