Essence

Centroid

Summary Included libraries Package variables Synopsis Description General documentation Methods

Summary
Essence::Centroid
Package variables top
No package variables defined.
Included modules top
Essence::IDF
Essence::Text
strict
Inherit top
Exporter
Synopsis top
    use Essence::Centroid;
Description top
This class is used to compute a (string) cluster centroid from a cluster.
Methods top
_build_centroid    No description    Code
_print             No description    Code
add_document       No description    Code
add_documents      No description    Code
centroid_score     No description    Code
compute_centroid   No description    Code
new                No description    Code
write_centroid     No description    Code

Methods description


Methods code

_build_centroid description top prev next
## Rebuild every derived table of the object from $self->{documents}:
##
##   $self->{tf}{$word}        average number of occurrences of $word per
##                             document
##   $self->{idf}{$word}       value returned by get_nidf($word)
##   $self->{tfidf}{$word}     tf * idf
##   $self->{centroid}{$word}  tf*idf of the words selected for the centroid
##
## A document is either a plain string or an Essence::WebDocument object
## (whose text is fetched with get_text()). Anything else yields no text
## and is skipped.
##
## Sets $self->{valid} to 1 and returns 1.
sub _build_centroid {
    my $self = shift;

    ## An object that never received a document is treated as an empty
    ## cluster instead of dying on an undefined array reference.
    my @documents = @{ $self->{documents} || [] };
    my $numdocs   = scalar @documents;

    ## CONSTANTS.
    ## TODO: put them someplace more appropriate.
    my $MIN_TFIDF         = 3;
    my $MIN_CENTROID_SIZE = 8 * $numdocs;

    ## Start from a clean slate. This sub runs again after every
    ## add_documents() call; without the reset the raw counts would be
    ## added on top of the previously averaged values, and words that
    ## dropped out of the centroid would linger in $self->{centroid}.
    $self->{$_} = {} foreach qw(tf idf tfidf centroid);

    ## compute $self->{tf}{$word} = tf (total in all docs).
    foreach my $doc (@documents) {
        my $text;
        if (ref($doc) eq "Essence::WebDocument") {
            $text = $doc->get_text();
        }
        elsif (!ref($doc)) {
            ## this is a scalar. treat it as a string.
            $text = $doc;
        }
        next unless defined $text;

        foreach my $word (split_words($text)) {
            $self->{tf}{$word}++;
        }
    }

    ## make $self->{tf}{$word} the AVERAGE occurrences per doc, then
    ## fill in $self->{idf}{$word} and $self->{tfidf}{$word}.
    ## (The tf hash is empty when there are no documents, so the
    ## division never sees $numdocs == 0.)
    foreach my $word (keys %{ $self->{tf} }) {
        my $tf  = $self->{tf}{$word} / $numdocs;
        my $idf = get_nidf($word);

        $self->{tf}{$word}    = $tf;
        $self->{idf}{$word}   = $idf;
        $self->{tfidf}{$word} = $tf * $idf;
    }

    ## fill in $self->{centroid}{$word}: walking the words by descending
    ## tf*idf, keep a word when its score exceeds $MIN_TFIDF or while
    ## fewer than $MIN_CENTROID_SIZE words have been kept.
    my $count = 0;
    foreach my $word (sort { $self->{tfidf}{$b} <=> $self->{tfidf}{$a} }
                      keys %{ $self->{tfidf} }) {
        if (   $self->{tfidf}{$word} > $MIN_TFIDF
            || $count < $MIN_CENTROID_SIZE) {
            $count++;
            $self->{centroid}{$word} = $self->{tfidf}{$word};
        }
    }

    ## our stuff is now valid.
    $self->{valid} = 1;
    return 1;
}
_print description top prev next
## Dump the centroid tables to the currently selected output handle, one
## word per line: the word, its tf, its tf*idf and its centroid weight.
## Words are listed by descending tf*idf. The tables are (re)built first
## when the object is not marked valid.
sub _print {
    my $self = shift;

    $self->_build_centroid() if !$self->{valid};

    my @by_score = sort { $self->{tfidf}{$b} <=> $self->{tfidf}{$a} }
                   keys %{ $self->{tfidf} };

    for my $entry (@by_score) {
        printf "%-16.16s %10.2f", $entry, $self->{tf}{$entry};
        printf " %15.10f", $self->{tfidf}{$entry};
        printf " %15.10f", $self->{centroid}{$entry};
        print "\n";
    }
}
add_document description top prev next
## Add a single document to the cluster. Only the first argument after
## the object is used; it is handed on to add_documents().
sub add_document {
    my $self = shift;
    my $doc  = shift;

    return $self->add_documents($doc);
}
add_documents description top prev next
## Append any number of documents to $self->{documents} and mark the
## derived tables as stale so that they are rebuilt on next use.
## Returns the value of the push, i.e. the new number of documents.
sub add_documents {
    my $self = shift;

    $self->{valid} = 0;
    push @{ $self->{documents} }, @_;
}
centroid_score description top prev next
## Score a piece of text against the centroid: the text is split with
## split_words() and the centroid weight of every resulting word (counted
## once per occurrence) is summed. The centroid is (re)built first when
## the object is not marked valid. Returns the sum, 0 for no words.
sub centroid_score {
    my $self = shift;
    my $text = shift;

    $self->_build_centroid() if !$self->{valid};

    my $total = 0;
    $total += $self->{centroid}{$_} for split_words($text);

    return $total;
}
compute_centroid description top prev next
## Build an Essence::Centroid from a cluster. This is a plain function,
## not a method.
##
## $cluster is a hash reference whose values are array references of
## sentence hashes; only the 'TEXT' entry of each sentence is read.
## The sentences of one document are concatenated, each preceded by a
## single space, and the result is added as one document.
##
## Returns the new centroid object.
sub compute_centroid {
    my ($cluster) = @_;

    my $centroid = Essence::Centroid->new();

    for my $sentences (values %$cluster) {
        my $joined = join '', map { " " . $_->{'TEXT'} } @$sentences;
        $centroid->add_document($joined);
    }

    return $centroid;
}
new description top prev next
## Constructor. Takes no arguments besides the class name and returns an
## object with no documents whose derived tables are marked as not yet
## built ($self->{valid} == 0).
sub new {
    my ($class) = @_;

    return bless { valid => 0 }, $class;
}
write_centroid description top prev next
## Write the centroid tables, one word per line: the word followed by its
## tf, idf and tf*idf. Words are listed by descending tf*idf. The tables
## are (re)built first when the object is not marked valid.
##
## Named arguments:
##   OUTPUT  a filehandle reference to print to, or the name of a file
##           to create/truncate. Defaults to STDOUT when absent or false.
##
## A file opened here is closed again before returning, so its contents
## are complete on disk; a handle supplied by the caller is left open.
## Dies when the named file cannot be opened or closed.
sub write_centroid {
    my $self = shift;
    my %args = @_;

    my $output = $args{'OUTPUT'} || \*STDOUT;

    ## A non-reference is a file name. Three-argument open keeps any
    ## mode characters or whitespace in the name from being interpreted,
    ## and the lexical handle cannot clash with a global TEMP handle.
    my $opened_path;
    unless (ref $output) {
        $opened_path = $output;
        open my $fh, '>', $opened_path or
            die "Unable to open '$opened_path' for printing extract: $!\n";
        $output = $fh;
    }

    unless ($self->{valid}) {
        $self->_build_centroid();
    }

    foreach my $word (sort { $self->{tfidf}{$b} <=> $self->{tfidf}{$a} }
                      keys %{$self->{tfidf}}) {
        printf {$output} "%-16.16s", $word;
        printf {$output} " %15.10f", $self->{tf}{$word};
        printf {$output} " %15.10f", $self->{idf}{$word};
        printf {$output} " %15.10f", $self->{tfidf}{$word};
        print {$output} "\n";
    }

    ## Buffered write errors only surface at close time, so check it.
    if (defined $opened_path) {
        close $output or
            die "Unable to close '$opened_path' after printing extract: $!\n";
    }

    return;
}

General documentation

No general documentation available.