Essence
Centroid
No package variables defined. |
This class computes a (string) centroid for a cluster of documents: each word is weighted by its average term frequency per document multiplied by its IDF.
|
_build_centroid | No description | Code |
_print | No description | Code |
add_document | No description | Code |
add_documents | No description | Code |
centroid_score | No description | Code |
compute_centroid | No description | Code |
new | No description | Code |
write_centroid | No description | Code |
Methods description
Methods code
_build_centroid | description | top | prev | next |
## Recompute the centroid tables from the documents held by this object.
##
## Fills, from scratch on every call:
##   $self->{tf}{$word}        average occurrences of $word per document
##   $self->{idf}{$word}       value returned by get_nidf($word)
##   $self->{tfidf}{$word}     tf * idf
##   $self->{centroid}{$word}  tfidf of every word selected for the centroid
## and then sets $self->{valid} to 1.  Always returns 1.
sub _build_centroid
{
    my $self = shift;

    ## CONSTANTS.
    ## TODO: put them someplace more appropriate.
    my $MIN_TFIDF = 3;

    ## An object that never received a document has no {documents} entry;
    ## treat that as an empty cluster instead of dereferencing undef.
    my @documents = @{ $self->{documents} || [] };
    my $numdocs = scalar @documents;
    my $MIN_CENTROID_SIZE = 8 * $numdocs;

    ## Work in fresh tables.  This sub runs again whenever documents are
    ## added after a previous build, and it walks ALL documents each time,
    ## so counting on top of the old (already averaged) values would give
    ## wrong term frequencies and keep stale centroid words.
    my (%tf, %idf, %tfidf, %centroid);

    ## compute $tf{$word} = tf (total in all docs).
    foreach my $doc (@documents) {
        my $text;
        if (ref($doc) eq "Essence::WebDocument") {
            $text = $doc->get_text();
        } elsif (! ref($doc)) { ## this is a scalar. treat it as a string.
            $text = $doc;
        }
        ## Anything else (or a document without text) contributes no words.
        next unless defined $text;

        my @words = split_words($text);
        foreach my $word (@words) {
            $tf{$word}++;
        }
    }

    ## make $tf{$word} the AVERAGE occurrences per doc, then derive idf and
    ## tfidf.  %tf is only non-empty when at least one document exists, so
    ## $numdocs cannot be zero inside this loop.
    foreach my $word (keys %tf) {
        $tf{$word}    = $tf{$word} / $numdocs;
        $idf{$word}   = get_nidf($word);
        $tfidf{$word} = $tf{$word} * $idf{$word};
    }

    ## fill in $centroid{$word}: walking the words from highest to lowest
    ## tfidf, keep a word while fewer than $MIN_CENTROID_SIZE words have
    ## been kept, and keep any word whose tfidf exceeds $MIN_TFIDF.
    ## Ties are broken alphabetically so the selection is reproducible.
    my $count = 0;
    foreach my $word (sort { $tfidf{$b} <=> $tfidf{$a} or $a cmp $b }
                      keys %tfidf) {
        if ( $tfidf{$word} > $MIN_TFIDF ||
             $count < $MIN_CENTROID_SIZE ) {
            $count++;
            $centroid{$word} = $tfidf{$word};
        }
    }

    ## publish the new tables in one go; our stuff is now valid.
    $self->{tf}       = \%tf;
    $self->{idf}      = \%idf;
    $self->{tfidf}    = \%tfidf;
    $self->{centroid} = \%centroid;
    $self->{valid}    = 1;
    return 1;
}
## Debugging aid: print one line per known word, highest tfidf first, to
## the currently selected output handle.  Each line holds the word
## (padded/truncated to 16 characters), its tf, its tfidf and its centroid
## weight.  The tables are rebuilt first if the object is not valid.
sub _print
{
    my $self = shift;
    unless ($self->{valid}) {
        $self->_build_centroid();
    }
    foreach my $word (sort { $self->{tfidf}{$b} <=> $self->{tfidf}{$a} }
                      keys %{$self->{tfidf}}) {
        ## Words that were not selected for the centroid have no entry in
        ## {centroid}; show them with weight 0 rather than handing undef
        ## to printf (same output, no "uninitialized value" warning).
        my $weight = $self->{centroid}{$word};
        $weight = 0 unless defined $weight;

        printf "%-16.16s %10.2f", $word, $self->{tf}{$word};
        printf " %15.10f", $self->{tfidf}{$word};
        printf " %15.10f", $weight;
        print "\n";
    }
}
## Add a single document to the cluster.  Only the first argument after
## the object is used; the work is delegated to add_documents(), whose
## return value is passed back to the caller.
sub add_document
{
    my $self = shift;
    my $doc  = shift;
    return $self->add_documents($doc);
}
## Append any number of documents to $self->{documents}.  The object is
## flagged as not valid so that the next reader rebuilds the centroid
## tables.  Returns what push returns: the new number of documents.
sub add_documents
{
    my $self = shift;

    ## Whatever was computed before no longer matches the document set.
    $self->{valid} = 0;

    return push @{ $self->{documents} }, @_;
}
## Score a piece of text against the centroid.
##
## $text is split with split_words(); the score is the sum of the centroid
## weights of the resulting words, counting a word once per occurrence.
## Words that are not part of the centroid add nothing.  The tables are
## rebuilt first if the object is not valid.  Returns the numeric score
## (0 when no word matches).
sub centroid_score
{
    my ($self, $text) = @_;
    unless ($self->{valid}) {
        $self->_build_centroid();
    }

    my $centroid = $self->{centroid} || {};
    my $score = 0;
    foreach my $word (split_words($text)) {
        ## Most words of an arbitrary text are not in the centroid; skip
        ## them explicitly instead of adding undef (which only produced
        ## "uninitialized value" warnings and the same total).
        my $weight = $centroid->{$word};
        $score += $weight if defined $weight;
    }
    return $score;
}
## Build an Essence::Centroid from a cluster.
##
## $cluster is a hash reference whose values are array references of
## sentence hashes.  For every value, the 'TEXT' field of each sentence is
## appended (each preceded by one space) to form the document text, which
## is then added to the centroid.  Called as a plain function, not as a
## method.  Returns the new centroid object.
sub compute_centroid
{
    my ($cluster) = @_;
    my $centroid = Essence::Centroid->new();

    foreach my $sentences (values %{$cluster}) {
        my @pieces;
        foreach my $sentence (@{$sentences}) {
            push @pieces, " ", $sentence->{'TEXT'};
        }
        $centroid->add_document(join("", @pieces));
    }

    return $centroid;
}
## Constructor.  Returns an empty centroid object blessed into $class and
## marked as not valid, so the tables are built on first use.
sub new
{
    my ($class) = @_;
    return bless { valid => 0 }, $class;
}
write_centroid | description | top | prev | next |
## Write the centroid tables as text, one line per word, highest tfidf
## first.  Each line holds the word (padded/truncated to 16 characters)
## followed by its tf, idf and tfidf.
##
## Named arguments:
##   OUTPUT  either a filehandle reference, which is written to and left
##           open for the caller, or a file name, which is created or
##           truncated, written and closed here.  Defaults to STDOUT.
##
## Dies if the file cannot be opened or if closing it reports an error.
## The tables are rebuilt first if the object is not valid.
## Returns 1 on success.
sub write_centroid
{
    my $self = shift;
    my %args = @_;
    my $output = $args{'OUTPUT'} || \*STDOUT;

    ## A plain string is a file name.  Open it with the three-argument
    ## form (the name can then never be read as an open mode) on a
    ## lexical handle (no shared global TEMP handle).
    my $path;
    unless (ref $output) {
        $path = $output;
        open(my $fh, '>', $path) or
            die "Unable to open '$path' for printing extract.\n";
        $output = $fh;
    }

    unless ($self->{valid}) {
        $self->_build_centroid();
    }

    foreach my $word (sort { $self->{tfidf}{$b} <=> $self->{tfidf}{$a} }
                      keys %{$self->{tfidf}}) {
        printf {$output} "%-16.16s", $word;
        printf {$output} " %15.10f", $self->{tf}{$word};
        printf {$output} " %15.10f", $self->{idf}{$word};
        printf {$output} " %15.10f", $self->{tfidf}{$word};
        print  {$output} "\n";
    }

    ## Only close what was opened here.  Closing flushes the buffer, so
    ## the file is complete when this sub returns, and it is the point
    ## where a failed write is finally reported.
    if (defined $path) {
        close($output) or
            die "Unable to finish writing '$path': $!\n";
    }

    return 1;
}
General documentation
No general documentation available.