Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 23 additions & 6 deletions include/timbl/Targets.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,15 @@ namespace Timbl {
public:
Vfield( const TargetValue *val, int freq, double w ):
value(val), frequency(freq), weight(w) {};
Vfield( const Vfield& in ):
value(in.value), frequency(in.frequency), weight(in.weight) {};
Vfield& operator=( const Vfield& ) = delete; // forbid copies
~Vfield(){};
// Vfield is a plain value type: it is stored BY VALUE in
// ClassDistribution's vector and owns nothing (value is a non-owning
// pointer into the Targets table). Full value semantics are therefore
// required (vector needs move-assignment to keep itself sorted) and safe.
Vfield( const Vfield& ) = default;
Vfield( Vfield&& ) = default;
Vfield& operator=( const Vfield& ) = default;
Vfield& operator=( Vfield&& ) = default;
~Vfield() = default;
std::ostream& put( std::ostream& ) const;
const TargetValue *Value() const { return value; };
void Value( const TargetValue *t ){ value = t; };
Expand All @@ -123,7 +128,7 @@ namespace Timbl {
void DecFreq() { frequency -= 1; };
double Weight() const { return weight; };
void SetWeight( double w ){ weight = w; };
size_t Index();
size_t Index() const;
protected:
const TargetValue *value;
size_t frequency;
Expand All @@ -138,7 +143,14 @@ namespace Timbl {
friend std::ostream& operator<<( std::ostream&, const ClassDistribution * );
friend class WClassDistribution;
public:
using VDlist = std::map<size_t, Vfield *>;
// The distribution is stored as a flat vector of Vfield values, kept
// sorted by value->Index(). Compared to the former
// std::map<size_t,Vfield*> this removes, per target class, one heap
// allocation and the red-black-tree node overhead -- a large memory win
// since there is (potentially) one distribution per instance-base node.
// Lookups are a binary search; the sorted invariant also keeps
// (de)serialisation order stable.
using VDlist = std::vector<Vfield>;
using dist_iterator = VDlist::const_iterator;
ClassDistribution( ): total_items(0) {};
ClassDistribution( const ClassDistribution& );
Expand Down Expand Up @@ -176,6 +188,11 @@ namespace Timbl {
const TargetValue* BestTargetW( bool &, bool = false ) const;
virtual ClassDistribution *clone( ) const {
return new ClassDistribution(); };
// find the entry for target index 'id'; returns nullptr if absent.
Vfield *find_index( size_t id );
const Vfield *find_index( size_t id ) const;
// return the sorted insert position (lower_bound) for target index 'id'.
VDlist::iterator lower_index( size_t id );
size_t total_items;
VDlist distribution;
};
Expand Down
28 changes: 14 additions & 14 deletions src/Features.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,8 @@ namespace Timbl {
// Loop over all present classes.
//
for ( const auto& tit : FV->TargetDist ){
FV->ValueClassProb->Assign( tit.second->Index(),
tit.second->Freq()/(double)freq );
FV->ValueClassProb->Assign( tit.Index(),
tit.Freq()/(double)freq );
}
}
}
Expand Down Expand Up @@ -313,7 +313,7 @@ namespace Timbl {
//
FVEntropy = 0.0;
for ( const auto& it : pnt->TargetDist ){
Prob = it.second->Freq()/(double)Freq;
Prob = it.Freq()/(double)Freq;
FVEntropy += Prob * Log2(Prob);
}
entropy += -FVEntropy * Freq / (double)TotalVals;
Expand Down Expand Up @@ -392,7 +392,7 @@ namespace Timbl {
if ( Freq > 0 ){
double FVEntropy = 0.0;
for ( const auto& tit : fv->TargetDist ){
double Prob = tit.second->Freq() / (double)Freq;
double Prob = tit.Freq() / (double)Freq;
FVEntropy += Prob * Log2(Prob);
}
entropy += -FVEntropy * Freq / (double)TotalVals;
Expand Down Expand Up @@ -442,8 +442,8 @@ namespace Timbl {
n_i_dot[i] = 0; // ALL values should be zeroed
const FeatureValue *fv = FVA[i];
for ( const auto& tit : fv->TargetDist ){
n_dot_j[tit.second->Index()-1] += tit.second->Freq();
n_i_dot[i] += tit.second->Freq();
n_dot_j[tit.Index()-1] += tit.Freq();
n_i_dot[i] += tit.Freq();
}
n_dot_dot += n_i_dot[i];
}
Expand All @@ -455,17 +455,17 @@ namespace Timbl {
if ( n >= Size ){
break;
}
while ( n < it.second->Index()-1 ){
while ( n < it.Index()-1 ){
double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) /
(double)n_dot_dot;
chi_square += tmp;
}
if ( n == it.second->Index()-1 ){
if ( n == it.Index()-1 ){
double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) /
(double)n_dot_dot;
if ( fabs(tmp) > Epsilon){
chi_square += ( (tmp - it.second->Freq()) *
(tmp - it.second->Freq()) ) / tmp;
chi_square += ( (tmp - it.Freq()) *
(tmp - it.Freq()) ) / tmp;
}
}
else {
Expand Down Expand Up @@ -496,8 +496,8 @@ namespace Timbl {
for ( const auto* fv : values_array ){
n_i_dot[i] = 0; // ALL values should be zeroed
for ( const auto& t_it : fv->TargetDist ){
long int fr = t_it.second->Freq();
n_dot_j[t_it.second->Index()-1] += fr;
long int fr = t_it.Freq();
n_dot_j[t_it.Index()-1] += fr;
n_i_dot[i] += fr;
}
n_dot_dot += n_i_dot[i];
Expand All @@ -511,8 +511,8 @@ namespace Timbl {
if ( n >= Size ){
break;
}
size_t id = t_it.second->Index()-1;
long int fr = t_it.second->Freq();
size_t id = t_it.Index()-1;
long int fr = t_it.Freq();
while ( n < id ){
double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) /
(double)n_dot_dot;
Expand Down
6 changes: 3 additions & 3 deletions src/MBLClass.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -1848,7 +1848,7 @@ namespace Timbl {
}
tester->init( Inst, EffectiveFeatures(), ib_offset );
auto lastpos = best_distrib->begin();
Vfield *Bpnt = lastpos->second;
const Vfield *Bpnt = ( lastpos != best_distrib->end() ) ? &*lastpos : nullptr;
size_t EffFeat = EffectiveFeatures() - ib_offset;
size_t CurPos = 0;
while ( Bpnt ) {
Expand All @@ -1873,7 +1873,7 @@ namespace Timbl {
CurPos = EndPos-1;
++lastpos;
if ( lastpos != best_distrib->end() ){
Bpnt = lastpos->second;
Bpnt = &*lastpos;
}
else {
best_distrib = IB->NextGraphTest( CurrentFV,
Expand All @@ -1882,7 +1882,7 @@ namespace Timbl {
if ( best_distrib ){
lastpos = best_distrib->begin();
if ( lastpos != best_distrib->end() ){
Bpnt = lastpos->second;
Bpnt = &*lastpos;
}
}
}
Expand Down
Loading
Loading