|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
pt.tumba.parser.Content
public class Content
Field Summary | |
---|---|
protected java.util.Map |
annotationCount
|
protected java.util.Map |
annotations
|
protected java.lang.StringBuffer |
content
|
private RabinHashFunction |
hashFunction
The Rabin hash function used to produce a hash code for the content of the document |
private MetaData |
metadata
|
protected int |
numTokens
|
protected int |
numTokensWithStopWords
|
protected java.util.List |
terms
|
protected java.util.Map |
termsLinks
|
protected java.util.List |
textBlocks
|
protected java.util.Map |
wordGrams
|
Constructor Summary | |
---|---|
protected |
Content(MetaData metadata)
|
Method Summary | |
---|---|
protected int |
getCountAux(java.lang.String term,
int i)
|
java.lang.String |
getFilteredText()
Returns the text extracted from the document, with garbage sentences removed |
int |
getFrequency(java.lang.String word)
Gets the frequency attribute of the HTMLParser object |
java.util.Iterator |
getFrequentTerms(int minfreq)
Gets the frequentTerms attribute of the HTMLParser object |
long |
getHashCode()
Description of the Method |
double |
getMean(java.lang.String word)
Gets the frequency attribute of the HTMLParser object |
int |
getNumTokens()
Description of the Method |
java.lang.String |
getOriginalContent()
Description of the Method |
int[] |
getPositions(java.lang.String word)
Gets the positions attribute of the HTMLParser object |
int[] |
getPositions(java.lang.String word,
java.lang.String url)
Gets the positions attribute of the HTMLParser object |
int[] |
getSentences(java.lang.String word)
Gets the sentence positions attribute of the HTMLParser object |
int |
getTermInfo(java.lang.String word)
Gets the termInfo attribute of the HTMLParser object |
java.util.Iterator |
getTerms()
Returns the terms extracted from the document. |
java.lang.String |
getText()
Returns the text extracted from the document |
java.lang.String[] |
getTextBlocks()
Returns the sentences extracted from the document |
java.lang.String |
getToken(int pos)
Returns the token at a given position in the document |
java.util.Iterator |
getTokens()
Returns the tokens extracted from the document |
double |
getTScore(java.lang.String word)
Gets the frequency attribute of the HTMLParser object |
double |
getVariance(java.lang.String word)
Gets the frequency attribute of the HTMLParser object |
int |
getWordGramFrequency(java.lang.String wgram)
Gets the nGramRank attribute of the HTMLParser object |
java.util.Iterator |
getWordGrams()
Returns the sequences of 1 to 5 consecutive words extracted from the document |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
protected java.util.List terms
protected java.util.Map termsLinks
protected java.util.List textBlocks
protected java.util.Map wordGrams
protected java.util.Map annotationCount
protected java.util.Map annotations
protected java.lang.StringBuffer content
protected int numTokens
protected int numTokensWithStopWords
private RabinHashFunction hashFunction
private MetaData metadata
Constructor Detail |
---|
protected Content(MetaData metadata)
Method Detail |
---|
public java.util.Iterator getTerms()
public java.lang.String getText()
public java.lang.String getFilteredText()
public java.lang.String[] getTextBlocks()
public java.lang.String getToken(int pos)
pos
- The position within the document
public java.util.Iterator getTokens()
public java.util.Iterator getWordGrams()
public int getNumTokens()
public long getHashCode()
public int getFrequency(java.lang.String word)
word
- Description of the Parameter
public java.util.Iterator getFrequentTerms(int minfreq)
minfreq
- Description of the Parameter
protected int getCountAux(java.lang.String term, int i)
public double getMean(java.lang.String word)
word
- Description of the Parameter
public int[] getPositions(java.lang.String word)
word
- Description of the Parameter
public int[] getPositions(java.lang.String word, java.lang.String url)
word
- Description of the Parameter
url
- Description of the Parameter
public int[] getSentences(java.lang.String word)
word
- Description of the Parameter
public int getTermInfo(java.lang.String word)
word
- Description of the Parameter
public double getTScore(java.lang.String word)
word
- Description of the Parameter
public double getVariance(java.lang.String word)
word
- Description of the Parameter
public int getWordGramFrequency(java.lang.String wgram)
wgram
- Description of the Parameter
public java.lang.String getOriginalContent()
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |