MEAD
SentFeature
Privates (from my definitions) |
$debug = 0 |
%sentfeature; |
$curr_did; |
$UTF_8_to_Big5 = Text::Iconv->new("UTF-8", "BIG5") |
$curr_sno; |
combine_sentfeatures | No description | Code |
dummy_sub | No description | Code |
extract_sentfeatures | No description | Code |
read_sentfeature | No description | Code |
read_sentfeature_handle_start | No description | Code |
write_feature_vector | No description | Code |
write_footer | No description | Code |
write_header | No description | Code |
write_sentfeature | No description | Code |
Methods description
Methods code
combine_sentfeatures | description | top | prev | next |
## combine_sentfeatures($destination, @sources)
##
## Merges each sentfeature structure in @sources into $destination and
## returns $destination.  A structure is a hash ref keyed by document id
## whose values are array refs indexed by sentence number; each sentence
## slot holds a hash ref of feature name => value.  Index 0 is never
## visited (the loop starts at sentence 1).  Sources are applied in
## argument order, so a feature set by a later source overwrites the same
## feature set earlier (or already present in $destination).
sub combine_sentfeatures
{
    my ($destination, @sources) = @_;

    for my $source (@sources) {
        for my $did (keys %{ $source }) {
            my $src_doc = $source->{$did};

            ## First time this document id is seen: give it an empty
            ## sentence array in the destination.
            $destination->{$did} = [] unless defined $destination->{$did};
            my $dst_doc = $destination->{$did};

            my $slot_count = @{ $src_doc };
            for my $sno (1 .. $slot_count - 1) {
                ## Work on a copy of the reference so that iterating its
                ## keys can never create an entry inside the source.
                my $src_sent = $src_doc->[$sno];

                ## Likewise make sure the destination has a feature hash
                ## for this sentence number.
                $dst_doc->[$sno] = {} unless defined $dst_doc->[$sno];
                my $dst_sent = $dst_doc->[$sno];

                $dst_sent->{$_} = $src_sent->{$_} for keys %{ $src_sent };
            }
        }
    }

    return $destination;
}
## dummy_sub: do-nothing placeholder used when no Cluster/Document/Sentence
## callback is supplied.  Ignores its arguments and returns nothing.
sub dummy_sub
{
    return;
}
## extract_sentfeatures($datadir, $arg_hash, $input_filehandle)
##
## Reads cluster specifications, one per line, from $input_filehandle
## (STDIN when omitted or false) and prints a SENT-FEATURE document via
## write_header / write_feature_vector / write_footer.
##
## Each input line holds a cluster filename optionally followed by a query
## filename.  The cluster is loaded with read_cluster($file, $datadir) and
## the query, when given, with read_query($file).
##
## $arg_hash may supply three callbacks; any that is missing or false
## defaults to dummy_sub:
##   Cluster  => called once per cluster as  ($cluster, $query)
##   Document => called once per document as ($document, $DID)
##   Sentence => called once per sentence as (\%feature_vector, $sentence);
##               whatever it stores in %feature_vector is written out.
##
## Dies if a line contains no non-whitespace field.
sub extract_sentfeatures
{
    my $datadir          = shift;
    my $arg_hash         = shift;
    my $input_filehandle = shift || \*STDIN;

    my $cluster_sub  = $$arg_hash{'Cluster'}  || \&dummy_sub;
    my $document_sub = $$arg_hash{'Document'} || \&dummy_sub;
    my $sentence_sub = $$arg_hash{'Sentence'} || \&dummy_sub;

    write_header();

    ## Read into a lexical rather than the global $_: a bare
    ## "while (<$fh>)" does not localize $_, so it would overwrite the
    ## caller's $_.
    while (defined(my $line = <$input_filehandle>)) {
        print $line if $debug;

        ## NOTE: the pattern is not anchored at the start of the line, so
        ## on a line with more than two fields only the last two are
        ## captured.
        $line =~ /\s*(\S+)\s*(\S*)\s*$/ or
            die "Expected cluster (and maybe also query), got:\n$line";

        my $cluster_filename = $1;
        my $query_filename   = $2;

        print "cluster_filename = $cluster_filename\n" if $debug;
        print "query_filename = $query_filename\n" if $debug;

        my $cluster = read_cluster($cluster_filename, $datadir);

        my $query;
        if ($query_filename) {
            $query = read_query($query_filename);
        }

        $cluster_sub->($cluster, $query);

        foreach my $DID (keys %{ $cluster }) {
            print "DID = $DID\n" if $debug;

            my $document = $$cluster{$DID};
            $document_sub->($document, $DID);

            ## Drop the dummy sentence #0.  This modifies the document
            ## array held by the cluster, after the Document callback
            ## has already seen it.
            shift @{ $document };

            foreach my $sentence (@{ $document }) {
                print "sentence...\n" if $debug;

                my %feature_vector;
                $sentence_sub->(\%feature_vector, $sentence);
                write_feature_vector(\%feature_vector, $sentence);
            }

            print "finished DID $DID\n" if $debug;
        }

        print "finished cluster $cluster_filename\n" if $debug;
    }

    write_footer();
}
## read_sentfeature($source)
##
## Parses a SENT-FEATURE XML document and returns its contents as a hash
## (document id => array ref indexed by sentence number => hash ref of
## feature name => value).  $source defaults to STDIN.  When $source is a
## reference (a filehandle or glob reference) it is parsed as a stream;
## otherwise it is treated as a filename.
##
## The result is accumulated in the file-level %sentfeature by the Start
## handler read_sentfeature_handle_start, so the parse state is cleared
## before every run.
sub read_sentfeature
{
    my $source = shift || \*STDIN;

    ## Reset the state shared with the Start handler.
    undef $curr_sno;
    undef $curr_did;
    %sentfeature = ();

    my %handlers   = (Start => \&read_sentfeature_handle_start);
    my $xml_parser = XML::Parser->new(Handlers => \%handlers);

    ## Handles and globs arrive as references; a plain string is a path.
    my $parse_method = ref $source ? 'parse' : 'parsefile';
    $xml_parser->$parse_method($source);

    return %sentfeature;
}
## read_sentfeature_handle_start($expat, $element_name, %atts)
##
## XML::Parser Start handler used by read_sentfeature.
##
##   <S DID=".." SNO="..">    remembers the current document id and
##                            sentence number in $curr_did / $curr_sno.
##   <FEATURE N=".." V="..">  records the feature for the current sentence
##                            as $sentfeature{DID}[SNO]{N} = V.
##
## Any other element is ignored.
sub read_sentfeature_handle_start
{
    ## The first argument is the Expat object, which is not needed here.
    my (undef, $element_name, %atts) = @_;

    if ($element_name eq 'S') {
        $curr_did = $atts{'DID'};
        $curr_sno = $atts{'SNO'};
    }
    elsif ($element_name eq 'FEATURE') {
        ## Autovivification creates the document array and the sentence
        ## hash on first use, so the stored structure is updated in place
        ## instead of copying the whole document array for every feature.
        $sentfeature{$curr_did}[$curr_sno]{ $atts{'N'} } = $atts{'V'};
    }

    return;
}
## write_feature_vector($feature_vector, $sentence)
##
## Prints one <S> element for $sentence to the currently selected output
## handle, with one <FEATURE N=".." V=".."/> child per entry of the hash
## ref $feature_vector.  The DID and SNO attributes are taken from the
## 'DID' and 'SNO' keys of the hash ref $sentence.
##
## Attribute values are XML-escaped so that names or values containing
## & < > or " still produce well-formed XML.  Features are emitted in
## sorted name order so the output is reproducible.
##
## When $debug is set, every key/value pair of $sentence is dumped first.
sub write_feature_vector
{
    my $feature_vector = shift;
    my $sentence       = shift;

    if ($debug) {
        foreach my $sent_key (keys %{ $sentence }) {
            print "$sent_key => $$sentence{$sent_key}\t";
        }
        print "\n";
    }

    my $did = _sentfeature_escape_attr($$sentence{'DID'});
    my $sno = _sentfeature_escape_attr($$sentence{'SNO'});
    print "\t<S DID=\"$did\" SNO=\"$sno\" >\n";

    foreach my $feature_name (sort keys %{ $feature_vector }) {
        my $name  = _sentfeature_escape_attr($feature_name);
        my $value = _sentfeature_escape_attr($$feature_vector{$feature_name});
        print "\t\t<FEATURE N=\"$name\" V=\"$value\" />\n";
    }

    print "\t</S>\n";
}

## _sentfeature_escape_attr($text)
##
## Returns $text made safe for use inside a double-quoted XML attribute
## value.  An undefined value is treated as the empty string.
sub _sentfeature_escape_attr
{
    my $text = shift;
    $text = '' unless defined $text;

    ## The ampersand must be handled first so the entities produced by
    ## the later substitutions are not escaped a second time.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;

    return $text;
}
## write_footer: prints the closing tag of the SENT-FEATURE document to
## the currently selected output handle.
sub write_footer
{
    return print "</SENT-FEATURE>\n";
}
## write_header: prints the XML declaration followed by the opening tag
## of the SENT-FEATURE document to the currently selected output handle.
sub write_header
{
    my @header_lines = (
        "<?xml version='1.0'?>\n",
        "<SENT-FEATURE>\n",
    );

    ## One print call per line, with the result of the last call returned.
    my $printed;
    foreach my $header_line (@header_lines) {
        $printed = print $header_line;
    }
    return $printed;
}
write_sentfeature | description | top | prev | next |
## write_sentfeature($sentfeature_ref, $destination)
##
## Serializes a sentfeature structure (document id => array ref indexed by
## sentence number => hash ref of feature name => value) as a SENT-FEATURE
## XML document using XML::Writer.  $destination is passed to the writer
## as its OUTPUT and defaults to STDOUT.
##
## Sentence numbers start at 1; index 0 of each document array is skipped.
## A sentence slot with no feature hash still produces an empty <S>.
sub write_sentfeature
{
    my ($sentfeature_ref, $destination) = @_;
    $destination ||= \*STDOUT;

    my $writer = XML::Writer->new(DATA_MODE => 1, OUTPUT => $destination);

    $writer->xmlDecl();
    $writer->doctype("SENT-FEATURE", "",
                     "/clair/tools/mead/dtd/sentfeature.dtd");
    $writer->startTag("SENT-FEATURE");

    for my $did (keys %{$sentfeature_ref}) {
        my $doc        = $sentfeature_ref->{$did};
        my $slot_count = @{$doc};

        for my $sno (1 .. $slot_count - 1) {
            $writer->startTag("S", "DID" => $did, "SNO" => $sno);

            ## Iterate over a copy of the reference so an empty slot is
            ## never created inside the caller's structure.
            my $features = $doc->[$sno];
            for my $fname (keys %{$features}) {
                $writer->emptyTag("FEATURE",
                                  "N" => $fname,
                                  "V" => $features->{$fname});
            }

            $writer->endTag();    ## closes <S>
        }
    }

    $writer->endTag();            ## closes <SENT-FEATURE>
    $writer->end();
}
General documentation
No general documentation available.