diff --git a/CMakeExternals/Vigra.patch b/CMakeExternals/Vigra.patch index 5198386a5e..297e23a8a8 100644 --- a/CMakeExternals/Vigra.patch +++ b/CMakeExternals/Vigra.patch @@ -1,1566 +1,1566 @@ diff -urNb vigra-Version-1-10-0/CMakeLists.txt Vigra-src/CMakeLists.txt --- vigra-Version-1-10-0/CMakeLists.txt 2013-11-18 17:48:16.000000000 +0100 +++ Vigra-src/CMakeLists.txt 2015-02-24 14:31:03.716528000 +0100 @@ -70,8 +70,11 @@ ENDIF() - + IF(WITH_HDF5) -+ FIND_PACKAGE(HDF5 PATHS "/local/mitk/release-binary/MITK-superbuild/HDF5-install" PATH_SUFFIXES hdf5 NO_DEFAULT_PATH NO_MODULE) ++ FIND_PACKAGE(HDF5 PATHS ${HDF5_DIR} PATH_SUFFIXES hdf5 NO_DEFAULT_PATH NO_MODULE) +IF(NOT HDF5_FOUND) VIGRA_FIND_PACKAGE(HDF5) ENDIF() +ENDIF() - + IF(WITH_BOOST_GRAPH) IF(WITH_VIGRANUMPY) @@ -395,3 +398,4 @@ ENDIF() - + MESSAGE( STATUS "---------------------------------------------------------" ) + diff -urNb vigra-Version-1-10-0/config/VigraConfig.cmake.in Vigra-src/config/VigraConfig.cmake.in --- vigra-Version-1-10-0/config/VigraConfig.cmake.in 2013-11-18 17:48:16.000000000 +0100 +++ Vigra-src/config/VigraConfig.cmake.in 2015-02-24 14:31:03.716528000 +0100 @@ -1,7 +1,9 @@ get_filename_component(SELF_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) get_filename_component(Vigra_TOP_DIR "${SELF_DIR}/../../" ABSOLUTE) - + -include(${SELF_DIR}/vigra-targets.cmake) +if(NOT TARGET vigraimpex) + include(${SELF_DIR}/vigra-targets.cmake) +endif() get_target_property(VIGRA_TYPE vigraimpex TYPE) if(${VIGRA_TYPE} STREQUAL "STATIC_LIBRARY") SET(VIGRA_STATIC_LIB True) @@ -12,3 +14,4 @@ IF(EXISTS ${SELF_DIR}/../vigranumpy/VigranumpyConfig.cmake) INCLUDE(${SELF_DIR}/../vigranumpy/VigranumpyConfig.cmake) ENDIF() + diff -urNb vigra-Version-1-10-0/include/vigra/random_forest/rf_decisionTree.hxx Vigra-src/include/vigra/random_forest/rf_decisionTree.hxx --- vigra-Version-1-10-0/include/vigra/random_forest/rf_decisionTree.hxx 2013-11-18 17:48:16.000000000 +0100 +++ Vigra-src/include/vigra/random_forest/rf_decisionTree.hxx 2015-01-16 19:16:34.000000000 +0100 @@ -90,6 +90,8 @@ ProblemSpec<> ext_param_; unsigned int classCount_; + std::map m_Parents; + public: /* \brief Create tree with parameters */ @@ -350,6 +352,22 @@ continueLearn(features,labels,stack_entry,split,stop,visitor,randint); } +template < class TRandomForest> +int GetTreeDepthForNode(int nodeIndex, TRandomForest* rf) +{ + int depth = 0; + while (true) + { + if (nodeIndex < 1) + { + break; + } + ++depth; + nodeIndex = rf->m_Parents[nodeIndex]; + } + return depth; +} + template < class U, class C, class U2, class C2, class StackEntry_t, @@ -374,6 +392,11 @@ size_t last_node_pos = 0; StackEntry_t top=stack.back(); + Split_t* splitPointer = &split; + bool isDepthSplitter = true; + + int maximumTreeDepth = splitPointer->GetMaximumTreeDepth(); + while(!stack.empty()) { @@ -392,7 +415,20 @@ //kind of node to make TreeInt NodeID; - if(stop(top)) + bool depthStop = false; + if (isDepthSplitter) + { + int currentDepthLevel; + if (top.leftParent != StackEntry_t::DecisionTreeNoParent) + currentDepthLevel = GetTreeDepthForNode(top.leftParent, this); + else + currentDepthLevel = GetTreeDepthForNode(top.rightParent, this); + + depthStop = (currentDepthLevel >= maximumTreeDepth); + } + if(stop(top) || (depthStop)) + + //if (stop(top) || currentDepthLevel >= MaximumSplitDepth(split)) NodeID = split.makeTerminalNode(features, labels, top, @@ -421,17 +457,20 @@ // Using InteriorNodeBase because exact parameter form not needed. // look at the Node base before getting scared. last_node_pos = topology_.size(); + m_Parents[last_node_pos] = StackEntry_t::DecisionTreeNoParent; if(top.leftParent != StackEntry_t::DecisionTreeNoParent) { NodeBase(topology_, parameters_, top.leftParent).child(0) = last_node_pos; + m_Parents[last_node_pos] = top.leftParent; } else if(top.rightParent != StackEntry_t::DecisionTreeNoParent) { NodeBase(topology_, parameters_, top.rightParent).child(1) = last_node_pos; + m_Parents[last_node_pos] = top.rightParent; } diff -urNb vigra-Version-1-10-0/include/vigra/random_forest/rf_nodeproxy.hxx Vigra-src/include/vigra/random_forest/rf_nodeproxy.hxx --- vigra-Version-1-10-0/include/vigra/random_forest/rf_nodeproxy.hxx 2013-11-18 17:48:16.000000000 +0100 +++ Vigra-src/include/vigra/random_forest/rf_nodeproxy.hxx 2015-01-13 16:56:56.000000000 +0100 @@ -50,7 +50,11 @@ namespace vigra { - +class DepthSplitterBase +{ +public: + virtual int GetMaximumTreeDepth() const = 0; +}; enum NodeTags { diff -urNb vigra-Version-1-10-0/include/vigra/random_forest/rf_split.hxx Vigra-src/include/vigra/random_forest/rf_split.hxx --- vigra-Version-1-10-0/include/vigra/random_forest/rf_split.hxx 2013-11-18 17:48:16.000000000 +0100 +++ Vigra-src/include/vigra/random_forest/rf_split.hxx 2015-01-16 19:11:02.000000000 +0100 @@ -108,6 +108,11 @@ \ref SplitBase::findBestSplit() or \ref SplitBase::makeTerminalNode(). **/ + virtual int GetMaximumTreeDepth() + { + return 1000000000; + } + template void set_external_parameters(ProblemSpec const & in) { diff -urNb vigra-Version-1-10-0/include/vigra/random_forest/rf_split-old.hxx Vigra-src/include/vigra/random_forest/rf_split-old.hxx --- vigra-Version-1-10-0/include/vigra/random_forest/rf_split-old.hxx 1970-01-01 01:00:00.000000000 +0100 +++ Vigra-src/include/vigra/random_forest/rf_split-old.hxx 2014-03-31 08:46:18.000000000 +0200 @@ -0,0 +1,1383 @@ +/************************************************************************/ +/* */ +/* Copyright 2008-2009 by Ullrich Koethe and Rahul Nair */ +/* */ +/* This file is part of the VIGRA computer vision library. */ +/* The VIGRA Website is */ +/* http://hci.iwr.uni-heidelberg.de/vigra/ */ +/* Please direct questions, bug reports, and contributions to */ +/* ullrich.koethe@iwr.uni-heidelberg.de or */ +/* vigra@informatik.uni-hamburg.de */ +/* */ +/* Permission is hereby granted, free of charge, to any person */ +/* obtaining a copy of this software and associated documentation */ +/* files (the "Software"), to deal in the Software without */ +/* restriction, including without limitation the rights to use, */ +/* copy, modify, merge, publish, distribute, sublicense, and/or */ +/* sell copies of the Software, and to permit persons to whom the */ +/* Software is furnished to do so, subject to the following */ +/* conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the */ +/* Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */ +/* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */ +/* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */ +/* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */ +/* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */ +/* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */ +/* OTHER DEALINGS IN THE SOFTWARE. */ +/* */ +/************************************************************************/ +#ifndef VIGRA_RANDOM_FOREST_SPLIT_HXX +#define VIGRA_RANDOM_FOREST_SPLIT_HXX +#include +#include +#include +#include +#include +#include "../mathutil.hxx" +#include "../array_vector.hxx" +#include "../sized_int.hxx" +#include "../matrix.hxx" +#include "../random.hxx" +#include "../functorexpression.hxx" +#include "rf_nodeproxy.hxx" +//#include "rf_sampling.hxx" +#include "rf_region.hxx" +//#include "../hokashyap.hxx" +//#include "vigra/rf_helpers.hxx" + +namespace vigra +{ + +// Incomplete Class to ensure that findBestSplit is always implemented in +// the derived classes of SplitBase +class CompileTimeError; + + +namespace detail +{ + template + class Normalise + { + public: + template + static void exec(Iter begin, Iter end) + {} + }; + + template<> + class Normalise + { + public: + template + static void exec (Iter begin, Iter end) + { + double bla = std::accumulate(begin, end, 0.0); + for(int ii = 0; ii < end - begin; ++ii) + begin[ii] = begin[ii]/bla ; + } + }; +} + + +/** Base Class for all SplitFunctors used with the \ref RandomForest class + defines the interface used while learning a tree. +**/ +template +class SplitBase +{ + public: + + typedef Tag RF_Tag; + typedef DT_StackEntry::iterator> + StackEntry_t; + + ProblemSpec<> ext_param_; + + NodeBase::T_Container_type t_data; + NodeBase::P_Container_type p_data; + + NodeBase node_; + + /** returns the DecisionTree Node created by + \ref SplitBase::findBestSplit() or \ref SplitBase::makeTerminalNode(). + **/ + + template + void set_external_parameters(ProblemSpec const & in) + { + ext_param_ = in; + t_data.push_back(in.column_count_); + t_data.push_back(in.class_count_); + } + + NodeBase & createNode() + { + return node_; + } + + int classCount() const + { + return int(t_data[1]); + } + + int featureCount() const + { + return int(t_data[0]); + } + + /** resets internal data. Should always be called before + calling findBestSplit or makeTerminalNode + **/ + void reset() + { + t_data.resize(2); + p_data.resize(0); + } + + + /** findBestSplit has to be re-implemented in derived split functor. + The defaut implementation only insures that a CompileTime error is issued + if no such method was defined. + **/ + + template + int findBestSplit(MultiArrayView<2, T, C> features, + MultiArrayView<2, T2, C2> labels, + Region region, + ArrayVector childs, + Random randint) + { +#ifndef __clang__ + // FIXME: This compile-time checking trick does not work for clang. + CompileTimeError SplitFunctor__findBestSplit_member_was_not_defined; +#endif + return 0; + } + + /** Default action for creating a terminal Node. + sets the Class probability of the remaining region according to + the class histogram + **/ + template + int makeTerminalNode(MultiArrayView<2, T, C> features, + MultiArrayView<2, T2, C2> labels, + Region & region, + Random randint) + { + Node ret(t_data, p_data); + node_ = ret; + if(ext_param_.class_weights_.size() != region.classCounts().size()) + { + std::copy(region.classCounts().begin(), + region.classCounts().end(), + ret.prob_begin()); + } + else + { + std::transform(region.classCounts().begin(), + region.classCounts().end(), + ext_param_.class_weights_.begin(), + ret.prob_begin(), std::multiplies()); + } + detail::Normalise::exec(ret.prob_begin(), ret.prob_end()); +// std::copy(ret.prob_begin(), ret.prob_end(), std::ostream_iterator(std::cerr, ", " )); +// std::cerr << std::endl; + ret.weights() = region.size(); + return e_ConstProbNode; + } + + +}; + +/** Functor to sort the indices of a feature Matrix by a certain dimension +**/ +template +class SortSamplesByDimensions +{ + DataMatrix const & data_; + MultiArrayIndex sortColumn_; + double thresVal_; + public: + + SortSamplesByDimensions(DataMatrix const & data, + MultiArrayIndex sortColumn, + double thresVal = 0.0) + : data_(data), + sortColumn_(sortColumn), + thresVal_(thresVal) + {} + + void setColumn(MultiArrayIndex sortColumn) + { + sortColumn_ = sortColumn; + } + void setThreshold(double value) + { + thresVal_ = value; + } + + bool operator()(MultiArrayIndex l, MultiArrayIndex r) const + { + return data_(l, sortColumn_) < data_(r, sortColumn_); + } + bool operator()(MultiArrayIndex l) const + { + return data_(l, sortColumn_) < thresVal_; + } +}; + +template +class DimensionNotEqual +{ + DataMatrix const & data_; + MultiArrayIndex sortColumn_; + + public: + + DimensionNotEqual(DataMatrix const & data, + MultiArrayIndex sortColumn) + : data_(data), + sortColumn_(sortColumn) + {} + + void setColumn(MultiArrayIndex sortColumn) + { + sortColumn_ = sortColumn; + } + + bool operator()(MultiArrayIndex l, MultiArrayIndex r) const + { + return data_(l, sortColumn_) != data_(r, sortColumn_); + } +}; + +template +class SortSamplesByHyperplane +{ + DataMatrix const & data_; + Node const & node_; + + public: + + SortSamplesByHyperplane(DataMatrix const & data, + Node const & node) + : + data_(data), + node_(node) + {} + + /** calculate the distance of a sample point to a hyperplane + */ + double operator[](MultiArrayIndex l) const + { + double result_l = -1 * node_.intercept(); + for(int ii = 0; ii < node_.columns_size(); ++ii) + { + result_l += rowVector(data_, l)[node_.columns_begin()[ii]] + * node_.weights()[ii]; + } + return result_l; + } + + bool operator()(MultiArrayIndex l, MultiArrayIndex r) const + { + return (*this)[l] < (*this)[r]; + } + +}; + +/** makes a Class Histogram given indices in a labels_ array + * usage: + * MultiArrayView<2, T2, C2> labels = makeSomeLabels() + * ArrayVector hist(numberOfLabels(labels), 0); + * RandomForestClassCounter counter(labels, hist); + * + * Container indices = getSomeIndices() + * std::for_each(indices, counter); + */ +template +class RandomForestClassCounter +{ + DataSource const & labels_; + CountArray & counts_; + + public: + + RandomForestClassCounter(DataSource const & labels, + CountArray & counts) + : labels_(labels), + counts_(counts) + { + reset(); + } + + void reset() + { + counts_.init(0); + } + + void operator()(MultiArrayIndex l) const + { + counts_[labels_[l]] +=1; + } +}; + + +/** Functor To Calculate the Best possible Split Based on the Gini Index + given Labels and Features along a given Axis +*/ + +namespace detail +{ + template + class ConstArr + { + public: + double operator[](size_t) const + { + return (double)N; + } + }; + + +} + + + + +/** Functor to calculate the entropy based impurity + */ +class EntropyCriterion +{ +public: + /**calculate the weighted gini impurity based on class histogram + * and class weights + */ + template + double operator() (Array const & hist, + Array2 const & weights, + double total = 1.0) const + { + return impurity(hist, weights, total); + } + + /** calculate the gini based impurity based on class histogram + */ + template + double operator()(Array const & hist, double total = 1.0) const + { + return impurity(hist, total); + } + + /** static version of operator(hist total) + */ + template + static double impurity(Array const & hist, double total) + { + return impurity(hist, detail::ConstArr<1>(), total); + } + + /** static version of operator(hist, weights, total) + */ + template + static double impurity (Array const & hist, + Array2 const & weights, + double total) + { + + int class_count = hist.size(); + double entropy = 0.0; + if(class_count == 2) + { + double p0 = (hist[0]/total); + double p1 = (hist[1]/total); + entropy = 0 - weights[0]*p0*std::log(p0) - weights[1]*p1*std::log(p1); + } + else + { + for(int ii = 0; ii < class_count; ++ii) + { + double w = weights[ii]; + double pii = hist[ii]/total; + entropy -= w*( pii*std::log(pii)); + } + } + entropy = total * entropy; + return entropy; + } +}; + +/** Functor to calculate the gini impurity + */ +class GiniCriterion +{ +public: + /**calculate the weighted gini impurity based on class histogram + * and class weights + */ + template + double operator() (Array const & hist, + Array2 const & weights, + double total = 1.0) const + { + return impurity(hist, weights, total); + } + + /** calculate the gini based impurity based on class histogram + */ + template + double operator()(Array const & hist, double total = 1.0) const + { + return impurity(hist, total); + } + + /** static version of operator(hist total) + */ + template + static double impurity(Array const & hist, double total) + { + return impurity(hist, detail::ConstArr<1>(), total); + } + + /** static version of operator(hist, weights, total) + */ + template + static double impurity (Array const & hist, + Array2 const & weights, + double total) + { + + int class_count = hist.size(); + double gini = 0.0; + if(class_count == 2) + { + double w = weights[0] * weights[1]; + gini = w * (hist[0] * hist[1] / total); + } + else + { + // Gini Impurity is defined as + // gini = sum_features-i ( p(i) * (1 - p(i) )) + for(int ii = 0; ii < class_count; ++ii) + { + double w = weights[ii]; + gini += w*( hist[ii]*( 1.0 - w * hist[ii]/total ) ); + } + gini /= total; + } + return gini; + } +}; + + +template +class ImpurityLoss +{ + + DataSource const & labels_; + ArrayVector counts_; + ArrayVector const class_weights_; + double total_counts_; + Impurity impurity_; + + public: + + template + ImpurityLoss(DataSource const & labels, + ProblemSpec const & ext_) + : labels_(labels), + counts_(ext_.class_count_, 0.0), + class_weights_(ext_.class_weights_), + total_counts_(0.0) + {} + + void reset() + { + counts_.init(0); + total_counts_ = 0.0; + } + + template + double increment_histogram(Counts const & counts) + { + std::transform(counts.begin(), counts.end(), + counts_.begin(), counts_.begin(), + std::plus()); + total_counts_ = std::accumulate( counts_.begin(), + counts_.end(), + 0.0); + return impurity_(counts_, class_weights_, total_counts_); + } + + template + double decrement_histogram(Counts const & counts) + { + std::transform(counts.begin(), counts.end(), + counts_.begin(), counts_.begin(), + std::minus()); + total_counts_ = std::accumulate( counts_.begin(), + counts_.end(), + 0.0); + return impurity_(counts_, class_weights_, total_counts_); + } + + template + double increment(Iter begin, Iter end) + { + for(Iter iter = begin; iter != end; ++iter) + { + counts_[labels_(*iter, 0)] +=1.0; + total_counts_ +=1.0; + } + return impurity_(counts_, class_weights_, total_counts_); + } + + template + double decrement(Iter const & begin, Iter const & end) + { + for(Iter iter = begin; iter != end; ++iter) + { + counts_[labels_(*iter,0)] -=1.0; + total_counts_ -=1.0; + } + return impurity_(counts_, class_weights_, total_counts_); + } + + template + double init (Iter begin, Iter end, Resp_t resp) + { + reset(); + std::copy(resp.begin(), resp.end(), counts_.begin()); + total_counts_ = std::accumulate(counts_.begin(), counts_.end(), 0.0); + return impurity_(counts_,class_weights_, total_counts_); + } + + ArrayVector const & response() + { + return counts_; + } +}; + + + + template + class RegressionForestCounter + { + public: + typedef MultiArrayShape<2>::type Shp; + DataSource const & labels_; + ArrayVector mean_; + ArrayVector variance_; + ArrayVector tmp_; + size_t count_; + int* end_; + + template + RegressionForestCounter(DataSource const & labels, + ProblemSpec const & ext_) + : + labels_(labels), + mean_(ext_.response_size_, 0.0), + variance_(ext_.response_size_, 0.0), + tmp_(ext_.response_size_), + count_(0) + {} + + template + double increment (Iter begin, Iter end) + { + for(Iter iter = begin; iter != end; ++iter) + { + ++count_; + for(unsigned int ii = 0; ii < mean_.size(); ++ii) + tmp_[ii] = labels_(*iter, ii) - mean_[ii]; + double f = 1.0 / count_, + f1 = 1.0 - f; + for(unsigned int ii = 0; ii < mean_.size(); ++ii) + mean_[ii] += f*tmp_[ii]; + for(unsigned int ii = 0; ii < mean_.size(); ++ii) + variance_[ii] += f1*sq(tmp_[ii]); + } + double res = std::accumulate(variance_.begin(), + variance_.end(), + 0.0, + std::plus()); + //std::cerr << res << " ) = "; + return res; + } + + template //This is BROKEN + double decrement (Iter begin, Iter end) + { + for(Iter iter = begin; iter != end; ++iter) + { + --count_; + } + + begin = end; + end = end + count_; + + + for(unsigned int ii = 0; ii < mean_.size(); ++ii) + { + mean_[ii] = 0; + for(Iter iter = begin; iter != end; ++iter) + { + mean_[ii] += labels_(*iter, ii); + } + mean_[ii] /= count_; + variance_[ii] = 0; + for(Iter iter = begin; iter != end; ++iter) + { + variance_[ii] += (labels_(*iter, ii) - mean_[ii])*(labels_(*iter, ii) - mean_[ii]); + } + } + double res = std::accumulate(variance_.begin(), + variance_.end(), + 0.0, + std::plus()); + //std::cerr << res << " ) = "; + return res; + } + + + template + double init (Iter begin, Iter end, Resp_t resp) + { + reset(); + return this->increment(begin, end); + + } + + + ArrayVector const & response() + { + return mean_; + } + + void reset() + { + mean_.init(0.0); + variance_.init(0.0); + count_ = 0; + } + }; + + +template +class RegressionForestCounter2 +{ +public: + typedef MultiArrayShape<2>::type Shp; + DataSource const & labels_; + ArrayVector mean_; + ArrayVector variance_; + ArrayVector tmp_; + size_t count_; + + template + RegressionForestCounter2(DataSource const & labels, + ProblemSpec const & ext_) + : + labels_(labels), + mean_(ext_.response_size_, 0.0), + variance_(ext_.response_size_, 0.0), + tmp_(ext_.response_size_), + count_(0) + {} + + template + double increment (Iter begin, Iter end) + { + for(Iter iter = begin; iter != end; ++iter) + { + ++count_; + for(int ii = 0; ii < mean_.size(); ++ii) + tmp_[ii] = labels_(*iter, ii) - mean_[ii]; + double f = 1.0 / count_, + f1 = 1.0 - f; + for(int ii = 0; ii < mean_.size(); ++ii) + mean_[ii] += f*tmp_[ii]; + for(int ii = 0; ii < mean_.size(); ++ii) + variance_[ii] += f1*sq(tmp_[ii]); + } + double res = std::accumulate(variance_.begin(), + variance_.end(), + 0.0, + std::plus()) + /((count_ == 1)? 1:(count_ -1)); + //std::cerr << res << " ) = "; + return res; + } + + template //This is BROKEN + double decrement (Iter begin, Iter end) + { + for(Iter iter = begin; iter != end; ++iter) + { + double f = 1.0 / count_, + f1 = 1.0 - f; + for(int ii = 0; ii < mean_.size(); ++ii) + mean_[ii] = (mean_[ii] - f*labels_(*iter,ii))/(1-f); + for(int ii = 0; ii < mean_.size(); ++ii) + variance_[ii] -= f1*sq(labels_(*iter,ii) - mean_[ii]); + --count_; + } + double res = std::accumulate(variance_.begin(), + variance_.end(), + 0.0, + std::plus()) + /((count_ == 1)? 1:(count_ -1)); + //std::cerr << "( " << res << " + "; + return res; + } + /* west's algorithm for incremental variance + // calculation + template + double increment (Iter begin, Iter end) + { + for(Iter iter = begin; iter != end; ++iter) + { + ++count_; + for(int ii = 0; ii < mean_.size(); ++ii) + tmp_[ii] = labels_(*iter, ii) - mean_[ii]; + double f = 1.0 / count_, + f1 = 1.0 - f; + for(int ii = 0; ii < mean_.size(); ++ii) + mean_[ii] += f*tmp_[ii]; + for(int ii = 0; ii < mean_.size(); ++ii) + variance_[ii] += f1*sq(tmp_[ii]); + } + return std::accumulate(variance_.begin(), + variance_.end(), + 0.0, + std::plus()) + /(count_ -1); + } + + template + double decrement (Iter begin, Iter end) + { + for(Iter iter = begin; iter != end; ++iter) + { + --count_; + for(int ii = 0; ii < mean_.size(); ++ii) + tmp_[ii] = labels_(*iter, ii) - mean_[ii]; + double f = 1.0 / count_, + f1 = 1.0 + f; + for(int ii = 0; ii < mean_.size(); ++ii) + mean_[ii] -= f*tmp_[ii]; + for(int ii = 0; ii < mean_.size(); ++ii) + variance_[ii] -= f1*sq(tmp_[ii]); + } + return std::accumulate(variance_.begin(), + variance_.end(), + 0.0, + std::plus()) + /(count_ -1); + }*/ + + template + double init (Iter begin, Iter end, Resp_t resp) + { + reset(); + return this->increment(begin, end, resp); + } + + + ArrayVector const & response() + { + return mean_; + } + + void reset() + { + mean_.init(0.0); + variance_.init(0.0); + count_ = 0; + } +}; + +template +struct LossTraits; + +struct LSQLoss +{}; + +template +struct LossTraits +{ + typedef ImpurityLoss type; +}; + +template +struct LossTraits +{ + typedef ImpurityLoss type; +}; + +template +struct LossTraits +{ + typedef RegressionForestCounter type; +}; + +/** Given a column, choose a split that minimizes some loss + */ +template +class BestGiniOfColumn +{ +public: + ArrayVector class_weights_; + ArrayVector bestCurrentCounts[2]; + double min_gini_; + std::ptrdiff_t min_index_; + double min_threshold_; + ProblemSpec<> ext_param_; + + BestGiniOfColumn() + {} + + template + BestGiniOfColumn(ProblemSpec const & ext) + : + class_weights_(ext.class_weights_), + ext_param_(ext) + { + bestCurrentCounts[0].resize(ext.class_count_); + bestCurrentCounts[1].resize(ext.class_count_); + } + template + void set_external_parameters(ProblemSpec const & ext) + { + class_weights_ = ext.class_weights_; + ext_param_ = ext; + bestCurrentCounts[0].resize(ext.class_count_); + bestCurrentCounts[1].resize(ext.class_count_); + } + /** calculate the best gini split along a Feature Column + * \param column the feature vector - has to support the [] operator + * \param labels the label vector + * \param begin + * \param end (in and out) + * begin and end iterators to the indices of the + * samples in the current region. + * the range begin - end is sorted by the column supplied + * during function execution. + * \param region_response + * ??? + * class histogram of the range. + * + * precondition: begin, end valid range, + * class_counts positive integer valued array with the + * class counts in the current range. + * labels.size() >= max(begin, end); + * postcondition: + * begin, end sorted by column given. + * min_gini_ contains the minimum gini found or + * NumericTraits::max if no split was found. + * min_index_ contains the splitting index in the range + * or invalid data if no split was found. + * BestCirremtcounts[0] and [1] contain the + * class histogram of the left and right region of + * the left and right regions. + */ + template< class DataSourceF_t, + class DataSource_t, + class I_Iter, + class Array> + void operator()(DataSourceF_t const & column, + DataSource_t const & labels, + I_Iter & begin, + I_Iter & end, + Array const & region_response) + { + std::sort(begin, end, + SortSamplesByDimensions(column, 0)); + typedef typename + LossTraits::type LineSearchLoss; + LineSearchLoss left(labels, ext_param_); //initialize left and right region + LineSearchLoss right(labels, ext_param_); + + + + min_gini_ = right.init(begin, end, region_response); + min_threshold_ = *begin; + min_index_ = 0; //the starting point where to split + DimensionNotEqual comp(column, 0); + + I_Iter iter = begin; + I_Iter next = std::adjacent_find(iter, end, comp); + //std::cerr << std::distance(begin, end) << std::endl; + while( next != end) + { + double lr = right.decrement(iter, next + 1); + double ll = left.increment(iter , next + 1); + double loss = lr +ll; + //std::cerr <> in; + } + + template + double loss_of_region(DataSource_t const & labels, + Iter & begin, + Iter & end, + Array const & region_response) const + { + typedef typename + LossTraits::type LineSearchLoss; + LineSearchLoss region_loss(labels, ext_param_); + return + region_loss.init(begin, end, region_response); + } + +}; + +namespace detail +{ + template + struct Correction + { + template + static void exec(Region & in, LabelT & labels) + {} + }; + + template<> + struct Correction + { + template + static void exec(Region & region, LabelT & labels) + { + if(std::accumulate(region.classCounts().begin(), + region.classCounts().end(), 0.0) != region.size()) + { + RandomForestClassCounter< LabelT, + ArrayVector > + counter(labels, region.classCounts()); + std::for_each( region.begin(), region.end(), counter); + region.classCountsIsValid = true; + } + } + }; +} + +/** Chooses mtry columns and applies ColumnDecisionFunctor to each of the + * columns. Then Chooses the column that is best + */ +template +class ThresholdSplit: public SplitBase +{ + public: + + + typedef SplitBase SB; + + ArrayVector splitColumns; + ColumnDecisionFunctor bgfunc; + + double region_gini_; + ArrayVector min_gini_; + ArrayVector min_indices_; + ArrayVector min_thresholds_; + + int bestSplitIndex; + + double minGini() const + { + return min_gini_[bestSplitIndex]; + } + int bestSplitColumn() const + { + return splitColumns[bestSplitIndex]; + } + double bestSplitThreshold() const + { + return min_thresholds_[bestSplitIndex]; + } + + template + void set_external_parameters(ProblemSpec const & in) + { + SB::set_external_parameters(in); + bgfunc.set_external_parameters( SB::ext_param_); + int featureCount_ = SB::ext_param_.column_count_; + splitColumns.resize(featureCount_); + for(int k=0; k + int findBestSplit(MultiArrayView<2, T, C> features, + MultiArrayView<2, T2, C2> labels, + Region & region, + ArrayVector& childRegions, + Random & randint) + { + + typedef typename Region::IndexIterator IndexIterator; + if(region.size() == 0) + { + std::cerr << "SplitFunctor::findBestSplit(): stackentry with 0 examples encountered\n" + "continuing learning process...."; + } + // calculate things that haven't been calculated yet. + detail::Correction::exec(region, labels); + + + // Is the region pure already? + region_gini_ = bgfunc.loss_of_region(labels, + region.begin(), + region.end(), + region.classCounts()); + if(region_gini_ <= SB::ext_param_.precision_) + return this->makeTerminalNode(features, labels, region, randint); + + // select columns to be tried. + for(int ii = 0; ii < SB::ext_param_.actual_mtry_; ++ii) + std::swap(splitColumns[ii], + splitColumns[ii+ randint(features.shape(1) - ii)]); + + // find the best gini index + bestSplitIndex = 0; + double current_min_gini = region_gini_; + int num2try = features.shape(1); + for(int k=0; kmakeTerminalNode(features, labels, region, randint); + + //create a Node for output + Node node(SB::t_data, SB::p_data); + SB::node_ = node; + node.threshold() = min_thresholds_[bestSplitIndex]; + node.column() = splitColumns[bestSplitIndex]; + + // partition the range according to the best dimension + SortSamplesByDimensions > + sorter(features, node.column(), node.threshold()); + IndexIterator bestSplit = + std::partition(region.begin(), region.end(), sorter); + // Save the ranges of the child stack entries. + childRegions[0].setRange( region.begin() , bestSplit ); + childRegions[0].rule = region.rule; + childRegions[0].rule.push_back(std::make_pair(1, 1.0)); + childRegions[1].setRange( bestSplit , region.end() ); + childRegions[1].rule = region.rule; + childRegions[1].rule.push_back(std::make_pair(1, 1.0)); + + return i_ThresholdNode; + } +}; + +typedef ThresholdSplit > GiniSplit; +typedef ThresholdSplit > EntropySplit; +typedef ThresholdSplit, RegressionTag> RegressionSplit; + +namespace rf +{ + +/** This namespace contains additional Splitfunctors. + * + * The Split functor classes are designed in a modular fashion because new split functors may + * share a lot of code with existing ones. + * + * ThresholdSplit implements the functionality needed for any split functor, that makes its + * decision via one dimensional axis-parallel cuts. The Template parameter defines how the split + * along one dimension is chosen. + * + * The BestGiniOfColumn class chooses a split that minimizes one of the Loss functions supplied + * (GiniCriterion for classification and LSQLoss for regression). Median chooses the Split in a + * kD tree fashion. + * + * + * Currently defined typedefs: + * \code + * typedef ThresholdSplit > GiniSplit; + * typedef ThresholdSplit, RegressionTag> RegressionSplit; + * typedef ThresholdSplit MedianSplit; + * \endcode + */ +namespace split +{ + +/** This Functor chooses the median value of a column + */ +class Median +{ +public: + + typedef GiniCriterion LineSearchLossTag; + ArrayVector class_weights_; + ArrayVector bestCurrentCounts[2]; + double min_gini_; + std::ptrdiff_t min_index_; + double min_threshold_; + ProblemSpec<> ext_param_; + + Median() + {} + + template + Median(ProblemSpec const & ext) + : + class_weights_(ext.class_weights_), + ext_param_(ext) + { + bestCurrentCounts[0].resize(ext.class_count_); + bestCurrentCounts[1].resize(ext.class_count_); + } + + template + void set_external_parameters(ProblemSpec const & ext) + { + class_weights_ = ext.class_weights_; + ext_param_ = ext; + bestCurrentCounts[0].resize(ext.class_count_); + bestCurrentCounts[1].resize(ext.class_count_); + } + + template< class DataSourceF_t, + class DataSource_t, + class I_Iter, + class Array> + void operator()(DataSourceF_t const & column, + DataSource_t const & labels, + I_Iter & begin, + I_Iter & end, + Array const & region_response) + { + std::sort(begin, end, + SortSamplesByDimensions(column, 0)); + typedef typename + LossTraits::type LineSearchLoss; + LineSearchLoss left(labels, ext_param_); + LineSearchLoss right(labels, ext_param_); + right.init(begin, end, region_response); + + min_gini_ = NumericTraits::max(); + min_index_ = floor(double(end - begin)/2.0); + min_threshold_ = column[*(begin + min_index_)]; + SortSamplesByDimensions + sorter(column, 0, min_threshold_); + I_Iter part = std::partition(begin, end, sorter); + DimensionNotEqual comp(column, 0); + if(part == begin) + { + part= std::adjacent_find(part, end, comp)+1; + + } + if(part >= end) + { + return; + } + else + { + min_threshold_ = column[*part]; + } + min_gini_ = right.decrement(begin, part) + + left.increment(begin , part); + + bestCurrentCounts[0] = left.response(); + bestCurrentCounts[1] = right.response(); + + min_index_ = part - begin; + } + + template + double loss_of_region(DataSource_t const & labels, + Iter & begin, + Iter & end, + Array const & region_response) const + { + typedef typename + LossTraits::type LineSearchLoss; + LineSearchLoss region_loss(labels, ext_param_); + return + region_loss.init(begin, end, region_response); + } + +}; + +typedef ThresholdSplit MedianSplit; + + +/** This Functor chooses a random value of a column + */ +class RandomSplitOfColumn +{ +public: + + typedef GiniCriterion LineSearchLossTag; + ArrayVector class_weights_; + ArrayVector bestCurrentCounts[2]; + double min_gini_; + std::ptrdiff_t min_index_; + double min_threshold_; + ProblemSpec<> ext_param_; + typedef RandomMT19937 Random_t; + Random_t random; + + RandomSplitOfColumn() + {} + + template + RandomSplitOfColumn(ProblemSpec const & ext) + : + class_weights_(ext.class_weights_), + ext_param_(ext), + random(RandomSeed) + { + bestCurrentCounts[0].resize(ext.class_count_); + bestCurrentCounts[1].resize(ext.class_count_); + } + + template + RandomSplitOfColumn(ProblemSpec const & ext, Random_t & random_) + : + class_weights_(ext.class_weights_), + ext_param_(ext), + random(random_) + { + bestCurrentCounts[0].resize(ext.class_count_); + bestCurrentCounts[1].resize(ext.class_count_); + } + + template + void set_external_parameters(ProblemSpec const & ext) + { + class_weights_ = ext.class_weights_; + ext_param_ = ext; + bestCurrentCounts[0].resize(ext.class_count_); + bestCurrentCounts[1].resize(ext.class_count_); + } + + template< class DataSourceF_t, + class DataSource_t, + class I_Iter, + class Array> + void operator()(DataSourceF_t const & column, + DataSource_t const & labels, + I_Iter & begin, + I_Iter & end, + Array const & region_response) + { + std::sort(begin, end, + SortSamplesByDimensions(column, 0)); + typedef typename + LossTraits::type LineSearchLoss; + LineSearchLoss left(labels, ext_param_); + LineSearchLoss right(labels, ext_param_); + right.init(begin, end, region_response); + + + min_gini_ = NumericTraits::max(); + int tmp_pt = random.uniformInt(std::distance(begin, end)); + min_index_ = tmp_pt; + min_threshold_ = column[*(begin + min_index_)]; + SortSamplesByDimensions + sorter(column, 0, min_threshold_); + I_Iter part = std::partition(begin, end, sorter); + DimensionNotEqual comp(column, 0); + if(part == begin) + { + part= std::adjacent_find(part, end, comp)+1; + + } + if(part >= end) + { + return; + } + else + { + min_threshold_ = column[*part]; + } + min_gini_ = right.decrement(begin, part) + + left.increment(begin , part); + + bestCurrentCounts[0] = left.response(); + bestCurrentCounts[1] = right.response(); + + min_index_ = part - begin; + } + + template + double loss_of_region(DataSource_t const & labels, + Iter & begin, + Iter & end, + Array const & region_response) const + { + typedef typename + LossTraits::type LineSearchLoss; + LineSearchLoss region_loss(labels, ext_param_); + return + region_loss.init(begin, end, region_response); + } + +}; + +typedef ThresholdSplit RandomSplit; +} +} + + +} //namespace vigra +#endif // VIGRA_RANDOM_FOREST_SPLIT_HXX diff -urNb vigra-Version-1-10-0/include/vigra/random_forest.hxx Vigra-src/include/vigra/random_forest.hxx --- vigra-Version-1-10-0/include/vigra/random_forest.hxx 2013-11-18 17:48:16.000000000 +0100 +++ Vigra-src/include/vigra/random_forest.hxx 2014-01-23 13:23:00.000000000 +0100 @@ -584,6 +584,7 @@ { vigra_precondition(features.shape(0) == labels.shape(0), "RandomForest::predictLabels(): Label array has wrong size."); +#pragma omp parallel for for(int k=0; k currentRow(rowVector(features, row));