diff --git a/CMakeExternals/PatchVigra-1.10.0.cmake b/CMakeExternals/PatchVigra-1.10.0.cmake deleted file mode 100644 index f60e79455d..0000000000 --- a/CMakeExternals/PatchVigra-1.10.0.cmake +++ /dev/null @@ -1,19 +0,0 @@ -# Called by Vigra.cmake (ExternalProject_Add) as a patch for Vigra. - -# read whole file -file(STRINGS CMakeLists.txt sourceCode NEWLINE_CONSUME) - -# Changing the way hdf5 is found. The Vigra FindHDF5.cmake file makes -# too many assumptions about the build structure of HDF5. Instead, we -# just use the hdf5-config.cmake file from our HDF5 install tree. -string(REPLACE "VIGRA_FIND_PACKAGE(HDF5)" "FIND_PACKAGE(HDF5 PATHS \"${HDF5_DIR}\" PATH_SUFFIXES hdf5 NO_DEFAULT_PATH NO_MODULE)\nIF(NOT HDF5_FOUND)\n VIGRA_FIND_PACKAGE(HDF5)\nENDIF()" sourceCode ${sourceCode}) - -# set variable CONTENTS, which is substituted in TEMPLATE_FILE -set(CONTENTS ${sourceCode}) -configure_file(${TEMPLATE_FILE} CMakeLists.txt @ONLY) - -# patch the VigraConfig.cmake.in file -file(STRINGS config/VigraConfig.cmake.in sourceCode NEWLINE_CONSUME) -string(REPLACE "include(\${SELF_DIR}/vigra-targets.cmake)" "if(NOT TARGET vigraimpex)\n include(\${SELF_DIR}/vigra-targets.cmake)\nendif()" sourceCode ${sourceCode}) -set(CONTENTS ${sourceCode}) -configure_file(${TEMPLATE_FILE} config/VigraConfig.cmake.in @ONLY) diff --git a/CMakeExternals/Vigra.cmake b/CMakeExternals/Vigra.cmake index ccde21e12a..1db5f47de4 100644 --- a/CMakeExternals/Vigra.cmake +++ b/CMakeExternals/Vigra.cmake @@ -1,46 +1,43 @@ #----------------------------------------------------------------------------- # VIGRA #----------------------------------------------------------------------------- if(MITK_USE_Vigra) # Sanity checks if(DEFINED Vigra_DIR AND NOT EXISTS ${Vigra_DIR}) message(FATAL_ERROR "Vigra_DIR variable is defined but corresponds to non-existing directory") endif() if(NOT ${MITK_USE_HDF5}) message(FATAL_ERROR "HDF5 is required for Vigra. Please enable it.") endif() set(proj Vigra) set(proj_DEPENDENCIES HDF5) set(Vigra_DEPENDS ${proj}) if(NOT DEFINED Vigra_DIR) - - set(patch_cmd ${CMAKE_COMMAND} -DTEMPLATE_FILE:FILEPATH=${CMAKE_CURRENT_LIST_DIR}/EmptyFileForPatching.dummy -DHDF5_DIR:PATH=${HDF5_DIR} -P ${CMAKE_CURRENT_LIST_DIR}/PatchVigra-1.10.0.cmake) - ExternalProject_Add(${proj} URL ${MITK_THIRDPARTY_DOWNLOAD_PREFIX_URL}/vigra-1.10.0-src.tar.gz URL_MD5 4f963f0be4fcb8b06271c2aa40baa9be - PATCH_COMMAND ${patch_cmd} + PATCH_COMMAND ${PATCH_COMMAND} -N -p1 -i ${CMAKE_CURRENT_LIST_DIR}/Vigra.patch CMAKE_GENERATOR ${gen} CMAKE_ARGS ${ep_common_args} -DAUTOEXEC_TESTS:BOOL=OFF -DWITH_VIGRANUMPY:BOOL=OFF -DHDF5_DIR:PATH=${HDF5_DIR} -DCMAKE_INSTALL_RPATH_USE_LINK_PATH:BOOL=TRUE -DCMAKE_INSTALL_PREFIX:PATH= DEPENDS ${proj_DEPENDENCIES} ) set(Vigra_DIR ${ep_prefix}) else() mitkMacroEmptyExternalProject(${proj} "${proj_DEPENDENCIES}") endif() endif(MITK_USE_Vigra) diff --git a/CMakeExternals/Vigra.patch b/CMakeExternals/Vigra.patch new file mode 100644 index 0000000000..19a651d393 --- /dev/null +++ b/CMakeExternals/Vigra.patch @@ -0,0 +1,1519 @@ +diff -urNb vigra-Version-1-10-0/CMakeLists.txt Vigra-src/CMakeLists.txt +--- vigra-Version-1-10-0/CMakeLists.txt 2013-11-18 17:48:16.000000000 +0100 ++++ Vigra-src/CMakeLists.txt 2015-02-24 14:31:03.716528000 +0100 +@@ -70,8 +70,11 @@ + ENDIF() + + IF(WITH_HDF5) ++ FIND_PACKAGE(HDF5 PATHS "/local/mitk/release-binary/MITK-superbuild/HDF5-install" PATH_SUFFIXES hdf5 NO_DEFAULT_PATH NO_MODULE) ++IF(NOT HDF5_FOUND) + VIGRA_FIND_PACKAGE(HDF5) + ENDIF() ++ENDIF() + + IF(WITH_BOOST_GRAPH) + IF(WITH_VIGRANUMPY) +@@ -395,3 +398,4 @@ + ENDIF() + + MESSAGE( STATUS "---------------------------------------------------------" ) ++ +diff -urNb vigra-Version-1-10-0/config/VigraConfig.cmake.in Vigra-src/config/VigraConfig.cmake.in +--- vigra-Version-1-10-0/config/VigraConfig.cmake.in 2013-11-18 17:48:16.000000000 +0100 ++++ Vigra-src/config/VigraConfig.cmake.in 2015-02-24 14:31:03.716528000 +0100 +@@ -1,7 +1,9 @@ + get_filename_component(SELF_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + get_filename_component(Vigra_TOP_DIR "${SELF_DIR}/../../" ABSOLUTE) + +-include(${SELF_DIR}/vigra-targets.cmake) ++if(NOT TARGET vigraimpex) ++ include(${SELF_DIR}/vigra-targets.cmake) ++endif() + get_target_property(VIGRA_TYPE vigraimpex TYPE) + if(${VIGRA_TYPE} STREQUAL "STATIC_LIBRARY") + SET(VIGRA_STATIC_LIB True) +@@ -12,3 +14,4 @@ + IF(EXISTS ${SELF_DIR}/../vigranumpy/VigranumpyConfig.cmake) + INCLUDE(${SELF_DIR}/../vigranumpy/VigranumpyConfig.cmake) + ENDIF() ++ +diff -urNb vigra-Version-1-10-0/include/vigra/random_forest/rf_decisionTree.hxx Vigra-src/include/vigra/random_forest/rf_decisionTree.hxx +--- vigra-Version-1-10-0/include/vigra/random_forest/rf_decisionTree.hxx 2013-11-18 17:48:16.000000000 +0100 ++++ Vigra-src/include/vigra/random_forest/rf_decisionTree.hxx 2014-01-16 16:26:12.000000000 +0100 +@@ -90,6 +90,8 @@ + ProblemSpec<> ext_param_; + unsigned int classCount_; + ++ std::map m_Parents; ++ + + public: + /* \brief Create tree with parameters */ +@@ -350,6 +352,25 @@ + continueLearn(features,labels,stack_entry,split,stop,visitor,randint); + } + ++template < class TRandomForest> ++int GetTreeDepthForNode(int nodeIndex, TRandomForest* rf) ++{ ++ int depth = 0; ++ while (true) ++ { ++ if (nodeIndex < 1) ++ { ++ break; ++ } ++ ++depth; ++ nodeIndex = rf->m_Parents[nodeIndex]; ++ } ++ return depth; ++} ++ ++ ++ ++ + template < class U, class C, + class U2, class C2, + class StackEntry_t, +@@ -391,8 +412,13 @@ + //produce a Terminal Node or the Split itself decides what + //kind of node to make + TreeInt NodeID; ++ int currentDepthLevel; ++ if (top.leftParent != StackEntry_t::DecisionTreeNoParent) ++ currentDepthLevel = GetTreeDepthForNode(top.leftParent, this); ++ else ++ currentDepthLevel = GetTreeDepthForNode(top.rightParent, this); + +- if(stop(top)) ++ if(stop(top) || (currentDepthLevel >= split.GetMaximumTreeDepth())) + NodeID = split.makeTerminalNode(features, + labels, + top, +@@ -421,17 +447,20 @@ + // Using InteriorNodeBase because exact parameter form not needed. + // look at the Node base before getting scared. + last_node_pos = topology_.size(); ++ m_Parents[last_node_pos] = StackEntry_t::DecisionTreeNoParent; + if(top.leftParent != StackEntry_t::DecisionTreeNoParent) + { + NodeBase(topology_, + parameters_, + top.leftParent).child(0) = last_node_pos; ++ m_Parents[last_node_pos] = top.leftParent; + } + else if(top.rightParent != StackEntry_t::DecisionTreeNoParent) + { + NodeBase(topology_, + parameters_, + top.rightParent).child(1) = last_node_pos; ++ m_Parents[last_node_pos] = top.rightParent; + } + + +diff -urNb vigra-Version-1-10-0/include/vigra/random_forest/rf_split-old.hxx Vigra-src/include/vigra/random_forest/rf_split-old.hxx +--- vigra-Version-1-10-0/include/vigra/random_forest/rf_split-old.hxx 1970-01-01 01:00:00.000000000 +0100 ++++ Vigra-src/include/vigra/random_forest/rf_split-old.hxx 2014-03-31 09:46:18.000000000 +0200 +@@ -0,0 +1,1383 @@ ++/************************************************************************/ ++/* */ ++/* Copyright 2008-2009 by Ullrich Koethe and Rahul Nair */ ++/* */ ++/* This file is part of the VIGRA computer vision library. */ ++/* The VIGRA Website is */ ++/* http://hci.iwr.uni-heidelberg.de/vigra/ */ ++/* Please direct questions, bug reports, and contributions to */ ++/* ullrich.koethe@iwr.uni-heidelberg.de or */ ++/* vigra@informatik.uni-hamburg.de */ ++/* */ ++/* Permission is hereby granted, free of charge, to any person */ ++/* obtaining a copy of this software and associated documentation */ ++/* files (the "Software"), to deal in the Software without */ ++/* restriction, including without limitation the rights to use, */ ++/* copy, modify, merge, publish, distribute, sublicense, and/or */ ++/* sell copies of the Software, and to permit persons to whom the */ ++/* Software is furnished to do so, subject to the following */ ++/* conditions: */ ++/* */ ++/* The above copyright notice and this permission notice shall be */ ++/* included in all copies or substantial portions of the */ ++/* Software. */ ++/* */ ++/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */ ++/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */ ++/* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */ ++/* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */ ++/* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */ ++/* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */ ++/* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */ ++/* OTHER DEALINGS IN THE SOFTWARE. */ ++/* */ ++/************************************************************************/ ++#ifndef VIGRA_RANDOM_FOREST_SPLIT_HXX ++#define VIGRA_RANDOM_FOREST_SPLIT_HXX ++#include ++#include ++#include ++#include ++#include ++#include "../mathutil.hxx" ++#include "../array_vector.hxx" ++#include "../sized_int.hxx" ++#include "../matrix.hxx" ++#include "../random.hxx" ++#include "../functorexpression.hxx" ++#include "rf_nodeproxy.hxx" ++//#include "rf_sampling.hxx" ++#include "rf_region.hxx" ++//#include "../hokashyap.hxx" ++//#include "vigra/rf_helpers.hxx" ++ ++namespace vigra ++{ ++ ++// Incomplete Class to ensure that findBestSplit is always implemented in ++// the derived classes of SplitBase ++class CompileTimeError; ++ ++ ++namespace detail ++{ ++ template ++ class Normalise ++ { ++ public: ++ template ++ static void exec(Iter begin, Iter end) ++ {} ++ }; ++ ++ template<> ++ class Normalise ++ { ++ public: ++ template ++ static void exec (Iter begin, Iter end) ++ { ++ double bla = std::accumulate(begin, end, 0.0); ++ for(int ii = 0; ii < end - begin; ++ii) ++ begin[ii] = begin[ii]/bla ; ++ } ++ }; ++} ++ ++ ++/** Base Class for all SplitFunctors used with the \ref RandomForest class ++ defines the interface used while learning a tree. ++**/ ++template ++class SplitBase ++{ ++ public: ++ ++ typedef Tag RF_Tag; ++ typedef DT_StackEntry::iterator> ++ StackEntry_t; ++ ++ ProblemSpec<> ext_param_; ++ ++ NodeBase::T_Container_type t_data; ++ NodeBase::P_Container_type p_data; ++ ++ NodeBase node_; ++ ++ /** returns the DecisionTree Node created by ++ \ref SplitBase::findBestSplit() or \ref SplitBase::makeTerminalNode(). ++ **/ ++ ++ template ++ void set_external_parameters(ProblemSpec const & in) ++ { ++ ext_param_ = in; ++ t_data.push_back(in.column_count_); ++ t_data.push_back(in.class_count_); ++ } ++ ++ NodeBase & createNode() ++ { ++ return node_; ++ } ++ ++ int classCount() const ++ { ++ return int(t_data[1]); ++ } ++ ++ int featureCount() const ++ { ++ return int(t_data[0]); ++ } ++ ++ /** resets internal data. Should always be called before ++ calling findBestSplit or makeTerminalNode ++ **/ ++ void reset() ++ { ++ t_data.resize(2); ++ p_data.resize(0); ++ } ++ ++ ++ /** findBestSplit has to be re-implemented in derived split functor. ++ The defaut implementation only insures that a CompileTime error is issued ++ if no such method was defined. ++ **/ ++ ++ template ++ int findBestSplit(MultiArrayView<2, T, C> features, ++ MultiArrayView<2, T2, C2> labels, ++ Region region, ++ ArrayVector childs, ++ Random randint) ++ { ++#ifndef __clang__ ++ // FIXME: This compile-time checking trick does not work for clang. ++ CompileTimeError SplitFunctor__findBestSplit_member_was_not_defined; ++#endif ++ return 0; ++ } ++ ++ /** Default action for creating a terminal Node. ++ sets the Class probability of the remaining region according to ++ the class histogram ++ **/ ++ template ++ int makeTerminalNode(MultiArrayView<2, T, C> features, ++ MultiArrayView<2, T2, C2> labels, ++ Region & region, ++ Random randint) ++ { ++ Node ret(t_data, p_data); ++ node_ = ret; ++ if(ext_param_.class_weights_.size() != region.classCounts().size()) ++ { ++ std::copy(region.classCounts().begin(), ++ region.classCounts().end(), ++ ret.prob_begin()); ++ } ++ else ++ { ++ std::transform(region.classCounts().begin(), ++ region.classCounts().end(), ++ ext_param_.class_weights_.begin(), ++ ret.prob_begin(), std::multiplies()); ++ } ++ detail::Normalise::exec(ret.prob_begin(), ret.prob_end()); ++// std::copy(ret.prob_begin(), ret.prob_end(), std::ostream_iterator(std::cerr, ", " )); ++// std::cerr << std::endl; ++ ret.weights() = region.size(); ++ return e_ConstProbNode; ++ } ++ ++ ++}; ++ ++/** Functor to sort the indices of a feature Matrix by a certain dimension ++**/ ++template ++class SortSamplesByDimensions ++{ ++ DataMatrix const & data_; ++ MultiArrayIndex sortColumn_; ++ double thresVal_; ++ public: ++ ++ SortSamplesByDimensions(DataMatrix const & data, ++ MultiArrayIndex sortColumn, ++ double thresVal = 0.0) ++ : data_(data), ++ sortColumn_(sortColumn), ++ thresVal_(thresVal) ++ {} ++ ++ void setColumn(MultiArrayIndex sortColumn) ++ { ++ sortColumn_ = sortColumn; ++ } ++ void setThreshold(double value) ++ { ++ thresVal_ = value; ++ } ++ ++ bool operator()(MultiArrayIndex l, MultiArrayIndex r) const ++ { ++ return data_(l, sortColumn_) < data_(r, sortColumn_); ++ } ++ bool operator()(MultiArrayIndex l) const ++ { ++ return data_(l, sortColumn_) < thresVal_; ++ } ++}; ++ ++template ++class DimensionNotEqual ++{ ++ DataMatrix const & data_; ++ MultiArrayIndex sortColumn_; ++ ++ public: ++ ++ DimensionNotEqual(DataMatrix const & data, ++ MultiArrayIndex sortColumn) ++ : data_(data), ++ sortColumn_(sortColumn) ++ {} ++ ++ void setColumn(MultiArrayIndex sortColumn) ++ { ++ sortColumn_ = sortColumn; ++ } ++ ++ bool operator()(MultiArrayIndex l, MultiArrayIndex r) const ++ { ++ return data_(l, sortColumn_) != data_(r, sortColumn_); ++ } ++}; ++ ++template ++class SortSamplesByHyperplane ++{ ++ DataMatrix const & data_; ++ Node const & node_; ++ ++ public: ++ ++ SortSamplesByHyperplane(DataMatrix const & data, ++ Node const & node) ++ : ++ data_(data), ++ node_(node) ++ {} ++ ++ /** calculate the distance of a sample point to a hyperplane ++ */ ++ double operator[](MultiArrayIndex l) const ++ { ++ double result_l = -1 * node_.intercept(); ++ for(int ii = 0; ii < node_.columns_size(); ++ii) ++ { ++ result_l += rowVector(data_, l)[node_.columns_begin()[ii]] ++ * node_.weights()[ii]; ++ } ++ return result_l; ++ } ++ ++ bool operator()(MultiArrayIndex l, MultiArrayIndex r) const ++ { ++ return (*this)[l] < (*this)[r]; ++ } ++ ++}; ++ ++/** makes a Class Histogram given indices in a labels_ array ++ * usage: ++ * MultiArrayView<2, T2, C2> labels = makeSomeLabels() ++ * ArrayVector hist(numberOfLabels(labels), 0); ++ * RandomForestClassCounter counter(labels, hist); ++ * ++ * Container indices = getSomeIndices() ++ * std::for_each(indices, counter); ++ */ ++template ++class RandomForestClassCounter ++{ ++ DataSource const & labels_; ++ CountArray & counts_; ++ ++ public: ++ ++ RandomForestClassCounter(DataSource const & labels, ++ CountArray & counts) ++ : labels_(labels), ++ counts_(counts) ++ { ++ reset(); ++ } ++ ++ void reset() ++ { ++ counts_.init(0); ++ } ++ ++ void operator()(MultiArrayIndex l) const ++ { ++ counts_[labels_[l]] +=1; ++ } ++}; ++ ++ ++/** Functor To Calculate the Best possible Split Based on the Gini Index ++ given Labels and Features along a given Axis ++*/ ++ ++namespace detail ++{ ++ template ++ class ConstArr ++ { ++ public: ++ double operator[](size_t) const ++ { ++ return (double)N; ++ } ++ }; ++ ++ ++} ++ ++ ++ ++ ++/** Functor to calculate the entropy based impurity ++ */ ++class EntropyCriterion ++{ ++public: ++ /**calculate the weighted gini impurity based on class histogram ++ * and class weights ++ */ ++ template ++ double operator() (Array const & hist, ++ Array2 const & weights, ++ double total = 1.0) const ++ { ++ return impurity(hist, weights, total); ++ } ++ ++ /** calculate the gini based impurity based on class histogram ++ */ ++ template ++ double operator()(Array const & hist, double total = 1.0) const ++ { ++ return impurity(hist, total); ++ } ++ ++ /** static version of operator(hist total) ++ */ ++ template ++ static double impurity(Array const & hist, double total) ++ { ++ return impurity(hist, detail::ConstArr<1>(), total); ++ } ++ ++ /** static version of operator(hist, weights, total) ++ */ ++ template ++ static double impurity (Array const & hist, ++ Array2 const & weights, ++ double total) ++ { ++ ++ int class_count = hist.size(); ++ double entropy = 0.0; ++ if(class_count == 2) ++ { ++ double p0 = (hist[0]/total); ++ double p1 = (hist[1]/total); ++ entropy = 0 - weights[0]*p0*std::log(p0) - weights[1]*p1*std::log(p1); ++ } ++ else ++ { ++ for(int ii = 0; ii < class_count; ++ii) ++ { ++ double w = weights[ii]; ++ double pii = hist[ii]/total; ++ entropy -= w*( pii*std::log(pii)); ++ } ++ } ++ entropy = total * entropy; ++ return entropy; ++ } ++}; ++ ++/** Functor to calculate the gini impurity ++ */ ++class GiniCriterion ++{ ++public: ++ /**calculate the weighted gini impurity based on class histogram ++ * and class weights ++ */ ++ template ++ double operator() (Array const & hist, ++ Array2 const & weights, ++ double total = 1.0) const ++ { ++ return impurity(hist, weights, total); ++ } ++ ++ /** calculate the gini based impurity based on class histogram ++ */ ++ template ++ double operator()(Array const & hist, double total = 1.0) const ++ { ++ return impurity(hist, total); ++ } ++ ++ /** static version of operator(hist total) ++ */ ++ template ++ static double impurity(Array const & hist, double total) ++ { ++ return impurity(hist, detail::ConstArr<1>(), total); ++ } ++ ++ /** static version of operator(hist, weights, total) ++ */ ++ template ++ static double impurity (Array const & hist, ++ Array2 const & weights, ++ double total) ++ { ++ ++ int class_count = hist.size(); ++ double gini = 0.0; ++ if(class_count == 2) ++ { ++ double w = weights[0] * weights[1]; ++ gini = w * (hist[0] * hist[1] / total); ++ } ++ else ++ { ++ // Gini Impurity is defined as ++ // gini = sum_features-i ( p(i) * (1 - p(i) )) ++ for(int ii = 0; ii < class_count; ++ii) ++ { ++ double w = weights[ii]; ++ gini += w*( hist[ii]*( 1.0 - w * hist[ii]/total ) ); ++ } ++ gini /= total; ++ } ++ return gini; ++ } ++}; ++ ++ ++template ++class ImpurityLoss ++{ ++ ++ DataSource const & labels_; ++ ArrayVector counts_; ++ ArrayVector const class_weights_; ++ double total_counts_; ++ Impurity impurity_; ++ ++ public: ++ ++ template ++ ImpurityLoss(DataSource const & labels, ++ ProblemSpec const & ext_) ++ : labels_(labels), ++ counts_(ext_.class_count_, 0.0), ++ class_weights_(ext_.class_weights_), ++ total_counts_(0.0) ++ {} ++ ++ void reset() ++ { ++ counts_.init(0); ++ total_counts_ = 0.0; ++ } ++ ++ template ++ double increment_histogram(Counts const & counts) ++ { ++ std::transform(counts.begin(), counts.end(), ++ counts_.begin(), counts_.begin(), ++ std::plus()); ++ total_counts_ = std::accumulate( counts_.begin(), ++ counts_.end(), ++ 0.0); ++ return impurity_(counts_, class_weights_, total_counts_); ++ } ++ ++ template ++ double decrement_histogram(Counts const & counts) ++ { ++ std::transform(counts.begin(), counts.end(), ++ counts_.begin(), counts_.begin(), ++ std::minus()); ++ total_counts_ = std::accumulate( counts_.begin(), ++ counts_.end(), ++ 0.0); ++ return impurity_(counts_, class_weights_, total_counts_); ++ } ++ ++ template ++ double increment(Iter begin, Iter end) ++ { ++ for(Iter iter = begin; iter != end; ++iter) ++ { ++ counts_[labels_(*iter, 0)] +=1.0; ++ total_counts_ +=1.0; ++ } ++ return impurity_(counts_, class_weights_, total_counts_); ++ } ++ ++ template ++ double decrement(Iter const & begin, Iter const & end) ++ { ++ for(Iter iter = begin; iter != end; ++iter) ++ { ++ counts_[labels_(*iter,0)] -=1.0; ++ total_counts_ -=1.0; ++ } ++ return impurity_(counts_, class_weights_, total_counts_); ++ } ++ ++ template ++ double init (Iter begin, Iter end, Resp_t resp) ++ { ++ reset(); ++ std::copy(resp.begin(), resp.end(), counts_.begin()); ++ total_counts_ = std::accumulate(counts_.begin(), counts_.end(), 0.0); ++ return impurity_(counts_,class_weights_, total_counts_); ++ } ++ ++ ArrayVector const & response() ++ { ++ return counts_; ++ } ++}; ++ ++ ++ ++ template ++ class RegressionForestCounter ++ { ++ public: ++ typedef MultiArrayShape<2>::type Shp; ++ DataSource const & labels_; ++ ArrayVector mean_; ++ ArrayVector variance_; ++ ArrayVector tmp_; ++ size_t count_; ++ int* end_; ++ ++ template ++ RegressionForestCounter(DataSource const & labels, ++ ProblemSpec const & ext_) ++ : ++ labels_(labels), ++ mean_(ext_.response_size_, 0.0), ++ variance_(ext_.response_size_, 0.0), ++ tmp_(ext_.response_size_), ++ count_(0) ++ {} ++ ++ template ++ double increment (Iter begin, Iter end) ++ { ++ for(Iter iter = begin; iter != end; ++iter) ++ { ++ ++count_; ++ for(unsigned int ii = 0; ii < mean_.size(); ++ii) ++ tmp_[ii] = labels_(*iter, ii) - mean_[ii]; ++ double f = 1.0 / count_, ++ f1 = 1.0 - f; ++ for(unsigned int ii = 0; ii < mean_.size(); ++ii) ++ mean_[ii] += f*tmp_[ii]; ++ for(unsigned int ii = 0; ii < mean_.size(); ++ii) ++ variance_[ii] += f1*sq(tmp_[ii]); ++ } ++ double res = std::accumulate(variance_.begin(), ++ variance_.end(), ++ 0.0, ++ std::plus()); ++ //std::cerr << res << " ) = "; ++ return res; ++ } ++ ++ template //This is BROKEN ++ double decrement (Iter begin, Iter end) ++ { ++ for(Iter iter = begin; iter != end; ++iter) ++ { ++ --count_; ++ } ++ ++ begin = end; ++ end = end + count_; ++ ++ ++ for(unsigned int ii = 0; ii < mean_.size(); ++ii) ++ { ++ mean_[ii] = 0; ++ for(Iter iter = begin; iter != end; ++iter) ++ { ++ mean_[ii] += labels_(*iter, ii); ++ } ++ mean_[ii] /= count_; ++ variance_[ii] = 0; ++ for(Iter iter = begin; iter != end; ++iter) ++ { ++ variance_[ii] += (labels_(*iter, ii) - mean_[ii])*(labels_(*iter, ii) - mean_[ii]); ++ } ++ } ++ double res = std::accumulate(variance_.begin(), ++ variance_.end(), ++ 0.0, ++ std::plus()); ++ //std::cerr << res << " ) = "; ++ return res; ++ } ++ ++ ++ template ++ double init (Iter begin, Iter end, Resp_t resp) ++ { ++ reset(); ++ return this->increment(begin, end); ++ ++ } ++ ++ ++ ArrayVector const & response() ++ { ++ return mean_; ++ } ++ ++ void reset() ++ { ++ mean_.init(0.0); ++ variance_.init(0.0); ++ count_ = 0; ++ } ++ }; ++ ++ ++template ++class RegressionForestCounter2 ++{ ++public: ++ typedef MultiArrayShape<2>::type Shp; ++ DataSource const & labels_; ++ ArrayVector mean_; ++ ArrayVector variance_; ++ ArrayVector tmp_; ++ size_t count_; ++ ++ template ++ RegressionForestCounter2(DataSource const & labels, ++ ProblemSpec const & ext_) ++ : ++ labels_(labels), ++ mean_(ext_.response_size_, 0.0), ++ variance_(ext_.response_size_, 0.0), ++ tmp_(ext_.response_size_), ++ count_(0) ++ {} ++ ++ template ++ double increment (Iter begin, Iter end) ++ { ++ for(Iter iter = begin; iter != end; ++iter) ++ { ++ ++count_; ++ for(int ii = 0; ii < mean_.size(); ++ii) ++ tmp_[ii] = labels_(*iter, ii) - mean_[ii]; ++ double f = 1.0 / count_, ++ f1 = 1.0 - f; ++ for(int ii = 0; ii < mean_.size(); ++ii) ++ mean_[ii] += f*tmp_[ii]; ++ for(int ii = 0; ii < mean_.size(); ++ii) ++ variance_[ii] += f1*sq(tmp_[ii]); ++ } ++ double res = std::accumulate(variance_.begin(), ++ variance_.end(), ++ 0.0, ++ std::plus()) ++ /((count_ == 1)? 1:(count_ -1)); ++ //std::cerr << res << " ) = "; ++ return res; ++ } ++ ++ template //This is BROKEN ++ double decrement (Iter begin, Iter end) ++ { ++ for(Iter iter = begin; iter != end; ++iter) ++ { ++ double f = 1.0 / count_, ++ f1 = 1.0 - f; ++ for(int ii = 0; ii < mean_.size(); ++ii) ++ mean_[ii] = (mean_[ii] - f*labels_(*iter,ii))/(1-f); ++ for(int ii = 0; ii < mean_.size(); ++ii) ++ variance_[ii] -= f1*sq(labels_(*iter,ii) - mean_[ii]); ++ --count_; ++ } ++ double res = std::accumulate(variance_.begin(), ++ variance_.end(), ++ 0.0, ++ std::plus()) ++ /((count_ == 1)? 1:(count_ -1)); ++ //std::cerr << "( " << res << " + "; ++ return res; ++ } ++ /* west's algorithm for incremental variance ++ // calculation ++ template ++ double increment (Iter begin, Iter end) ++ { ++ for(Iter iter = begin; iter != end; ++iter) ++ { ++ ++count_; ++ for(int ii = 0; ii < mean_.size(); ++ii) ++ tmp_[ii] = labels_(*iter, ii) - mean_[ii]; ++ double f = 1.0 / count_, ++ f1 = 1.0 - f; ++ for(int ii = 0; ii < mean_.size(); ++ii) ++ mean_[ii] += f*tmp_[ii]; ++ for(int ii = 0; ii < mean_.size(); ++ii) ++ variance_[ii] += f1*sq(tmp_[ii]); ++ } ++ return std::accumulate(variance_.begin(), ++ variance_.end(), ++ 0.0, ++ std::plus()) ++ /(count_ -1); ++ } ++ ++ template ++ double decrement (Iter begin, Iter end) ++ { ++ for(Iter iter = begin; iter != end; ++iter) ++ { ++ --count_; ++ for(int ii = 0; ii < mean_.size(); ++ii) ++ tmp_[ii] = labels_(*iter, ii) - mean_[ii]; ++ double f = 1.0 / count_, ++ f1 = 1.0 + f; ++ for(int ii = 0; ii < mean_.size(); ++ii) ++ mean_[ii] -= f*tmp_[ii]; ++ for(int ii = 0; ii < mean_.size(); ++ii) ++ variance_[ii] -= f1*sq(tmp_[ii]); ++ } ++ return std::accumulate(variance_.begin(), ++ variance_.end(), ++ 0.0, ++ std::plus()) ++ /(count_ -1); ++ }*/ ++ ++ template ++ double init (Iter begin, Iter end, Resp_t resp) ++ { ++ reset(); ++ return this->increment(begin, end, resp); ++ } ++ ++ ++ ArrayVector const & response() ++ { ++ return mean_; ++ } ++ ++ void reset() ++ { ++ mean_.init(0.0); ++ variance_.init(0.0); ++ count_ = 0; ++ } ++}; ++ ++template ++struct LossTraits; ++ ++struct LSQLoss ++{}; ++ ++template ++struct LossTraits ++{ ++ typedef ImpurityLoss type; ++}; ++ ++template ++struct LossTraits ++{ ++ typedef ImpurityLoss type; ++}; ++ ++template ++struct LossTraits ++{ ++ typedef RegressionForestCounter type; ++}; ++ ++/** Given a column, choose a split that minimizes some loss ++ */ ++template ++class BestGiniOfColumn ++{ ++public: ++ ArrayVector class_weights_; ++ ArrayVector bestCurrentCounts[2]; ++ double min_gini_; ++ std::ptrdiff_t min_index_; ++ double min_threshold_; ++ ProblemSpec<> ext_param_; ++ ++ BestGiniOfColumn() ++ {} ++ ++ template ++ BestGiniOfColumn(ProblemSpec const & ext) ++ : ++ class_weights_(ext.class_weights_), ++ ext_param_(ext) ++ { ++ bestCurrentCounts[0].resize(ext.class_count_); ++ bestCurrentCounts[1].resize(ext.class_count_); ++ } ++ template ++ void set_external_parameters(ProblemSpec const & ext) ++ { ++ class_weights_ = ext.class_weights_; ++ ext_param_ = ext; ++ bestCurrentCounts[0].resize(ext.class_count_); ++ bestCurrentCounts[1].resize(ext.class_count_); ++ } ++ /** calculate the best gini split along a Feature Column ++ * \param column the feature vector - has to support the [] operator ++ * \param labels the label vector ++ * \param begin ++ * \param end (in and out) ++ * begin and end iterators to the indices of the ++ * samples in the current region. ++ * the range begin - end is sorted by the column supplied ++ * during function execution. ++ * \param region_response ++ * ??? ++ * class histogram of the range. ++ * ++ * precondition: begin, end valid range, ++ * class_counts positive integer valued array with the ++ * class counts in the current range. ++ * labels.size() >= max(begin, end); ++ * postcondition: ++ * begin, end sorted by column given. ++ * min_gini_ contains the minimum gini found or ++ * NumericTraits::max if no split was found. ++ * min_index_ contains the splitting index in the range ++ * or invalid data if no split was found. ++ * BestCirremtcounts[0] and [1] contain the ++ * class histogram of the left and right region of ++ * the left and right regions. ++ */ ++ template< class DataSourceF_t, ++ class DataSource_t, ++ class I_Iter, ++ class Array> ++ void operator()(DataSourceF_t const & column, ++ DataSource_t const & labels, ++ I_Iter & begin, ++ I_Iter & end, ++ Array const & region_response) ++ { ++ std::sort(begin, end, ++ SortSamplesByDimensions(column, 0)); ++ typedef typename ++ LossTraits::type LineSearchLoss; ++ LineSearchLoss left(labels, ext_param_); //initialize left and right region ++ LineSearchLoss right(labels, ext_param_); ++ ++ ++ ++ min_gini_ = right.init(begin, end, region_response); ++ min_threshold_ = *begin; ++ min_index_ = 0; //the starting point where to split ++ DimensionNotEqual comp(column, 0); ++ ++ I_Iter iter = begin; ++ I_Iter next = std::adjacent_find(iter, end, comp); ++ //std::cerr << std::distance(begin, end) << std::endl; ++ while( next != end) ++ { ++ double lr = right.decrement(iter, next + 1); ++ double ll = left.increment(iter , next + 1); ++ double loss = lr +ll; ++ //std::cerr <> in; ++ } ++ ++ template ++ double loss_of_region(DataSource_t const & labels, ++ Iter & begin, ++ Iter & end, ++ Array const & region_response) const ++ { ++ typedef typename ++ LossTraits::type LineSearchLoss; ++ LineSearchLoss region_loss(labels, ext_param_); ++ return ++ region_loss.init(begin, end, region_response); ++ } ++ ++}; ++ ++namespace detail ++{ ++ template ++ struct Correction ++ { ++ template ++ static void exec(Region & in, LabelT & labels) ++ {} ++ }; ++ ++ template<> ++ struct Correction ++ { ++ template ++ static void exec(Region & region, LabelT & labels) ++ { ++ if(std::accumulate(region.classCounts().begin(), ++ region.classCounts().end(), 0.0) != region.size()) ++ { ++ RandomForestClassCounter< LabelT, ++ ArrayVector > ++ counter(labels, region.classCounts()); ++ std::for_each( region.begin(), region.end(), counter); ++ region.classCountsIsValid = true; ++ } ++ } ++ }; ++} ++ ++/** Chooses mtry columns and applies ColumnDecisionFunctor to each of the ++ * columns. Then Chooses the column that is best ++ */ ++template ++class ThresholdSplit: public SplitBase ++{ ++ public: ++ ++ ++ typedef SplitBase SB; ++ ++ ArrayVector splitColumns; ++ ColumnDecisionFunctor bgfunc; ++ ++ double region_gini_; ++ ArrayVector min_gini_; ++ ArrayVector min_indices_; ++ ArrayVector min_thresholds_; ++ ++ int bestSplitIndex; ++ ++ double minGini() const ++ { ++ return min_gini_[bestSplitIndex]; ++ } ++ int bestSplitColumn() const ++ { ++ return splitColumns[bestSplitIndex]; ++ } ++ double bestSplitThreshold() const ++ { ++ return min_thresholds_[bestSplitIndex]; ++ } ++ ++ template ++ void set_external_parameters(ProblemSpec const & in) ++ { ++ SB::set_external_parameters(in); ++ bgfunc.set_external_parameters( SB::ext_param_); ++ int featureCount_ = SB::ext_param_.column_count_; ++ splitColumns.resize(featureCount_); ++ for(int k=0; k ++ int findBestSplit(MultiArrayView<2, T, C> features, ++ MultiArrayView<2, T2, C2> labels, ++ Region & region, ++ ArrayVector& childRegions, ++ Random & randint) ++ { ++ ++ typedef typename Region::IndexIterator IndexIterator; ++ if(region.size() == 0) ++ { ++ std::cerr << "SplitFunctor::findBestSplit(): stackentry with 0 examples encountered\n" ++ "continuing learning process...."; ++ } ++ // calculate things that haven't been calculated yet. ++ detail::Correction::exec(region, labels); ++ ++ ++ // Is the region pure already? ++ region_gini_ = bgfunc.loss_of_region(labels, ++ region.begin(), ++ region.end(), ++ region.classCounts()); ++ if(region_gini_ <= SB::ext_param_.precision_) ++ return this->makeTerminalNode(features, labels, region, randint); ++ ++ // select columns to be tried. ++ for(int ii = 0; ii < SB::ext_param_.actual_mtry_; ++ii) ++ std::swap(splitColumns[ii], ++ splitColumns[ii+ randint(features.shape(1) - ii)]); ++ ++ // find the best gini index ++ bestSplitIndex = 0; ++ double current_min_gini = region_gini_; ++ int num2try = features.shape(1); ++ for(int k=0; kmakeTerminalNode(features, labels, region, randint); ++ ++ //create a Node for output ++ Node node(SB::t_data, SB::p_data); ++ SB::node_ = node; ++ node.threshold() = min_thresholds_[bestSplitIndex]; ++ node.column() = splitColumns[bestSplitIndex]; ++ ++ // partition the range according to the best dimension ++ SortSamplesByDimensions > ++ sorter(features, node.column(), node.threshold()); ++ IndexIterator bestSplit = ++ std::partition(region.begin(), region.end(), sorter); ++ // Save the ranges of the child stack entries. ++ childRegions[0].setRange( region.begin() , bestSplit ); ++ childRegions[0].rule = region.rule; ++ childRegions[0].rule.push_back(std::make_pair(1, 1.0)); ++ childRegions[1].setRange( bestSplit , region.end() ); ++ childRegions[1].rule = region.rule; ++ childRegions[1].rule.push_back(std::make_pair(1, 1.0)); ++ ++ return i_ThresholdNode; ++ } ++}; ++ ++typedef ThresholdSplit > GiniSplit; ++typedef ThresholdSplit > EntropySplit; ++typedef ThresholdSplit, RegressionTag> RegressionSplit; ++ ++namespace rf ++{ ++ ++/** This namespace contains additional Splitfunctors. ++ * ++ * The Split functor classes are designed in a modular fashion because new split functors may ++ * share a lot of code with existing ones. ++ * ++ * ThresholdSplit implements the functionality needed for any split functor, that makes its ++ * decision via one dimensional axis-parallel cuts. The Template parameter defines how the split ++ * along one dimension is chosen. ++ * ++ * The BestGiniOfColumn class chooses a split that minimizes one of the Loss functions supplied ++ * (GiniCriterion for classification and LSQLoss for regression). Median chooses the Split in a ++ * kD tree fashion. ++ * ++ * ++ * Currently defined typedefs: ++ * \code ++ * typedef ThresholdSplit > GiniSplit; ++ * typedef ThresholdSplit, RegressionTag> RegressionSplit; ++ * typedef ThresholdSplit MedianSplit; ++ * \endcode ++ */ ++namespace split ++{ ++ ++/** This Functor chooses the median value of a column ++ */ ++class Median ++{ ++public: ++ ++ typedef GiniCriterion LineSearchLossTag; ++ ArrayVector class_weights_; ++ ArrayVector bestCurrentCounts[2]; ++ double min_gini_; ++ std::ptrdiff_t min_index_; ++ double min_threshold_; ++ ProblemSpec<> ext_param_; ++ ++ Median() ++ {} ++ ++ template ++ Median(ProblemSpec const & ext) ++ : ++ class_weights_(ext.class_weights_), ++ ext_param_(ext) ++ { ++ bestCurrentCounts[0].resize(ext.class_count_); ++ bestCurrentCounts[1].resize(ext.class_count_); ++ } ++ ++ template ++ void set_external_parameters(ProblemSpec const & ext) ++ { ++ class_weights_ = ext.class_weights_; ++ ext_param_ = ext; ++ bestCurrentCounts[0].resize(ext.class_count_); ++ bestCurrentCounts[1].resize(ext.class_count_); ++ } ++ ++ template< class DataSourceF_t, ++ class DataSource_t, ++ class I_Iter, ++ class Array> ++ void operator()(DataSourceF_t const & column, ++ DataSource_t const & labels, ++ I_Iter & begin, ++ I_Iter & end, ++ Array const & region_response) ++ { ++ std::sort(begin, end, ++ SortSamplesByDimensions(column, 0)); ++ typedef typename ++ LossTraits::type LineSearchLoss; ++ LineSearchLoss left(labels, ext_param_); ++ LineSearchLoss right(labels, ext_param_); ++ right.init(begin, end, region_response); ++ ++ min_gini_ = NumericTraits::max(); ++ min_index_ = floor(double(end - begin)/2.0); ++ min_threshold_ = column[*(begin + min_index_)]; ++ SortSamplesByDimensions ++ sorter(column, 0, min_threshold_); ++ I_Iter part = std::partition(begin, end, sorter); ++ DimensionNotEqual comp(column, 0); ++ if(part == begin) ++ { ++ part= std::adjacent_find(part, end, comp)+1; ++ ++ } ++ if(part >= end) ++ { ++ return; ++ } ++ else ++ { ++ min_threshold_ = column[*part]; ++ } ++ min_gini_ = right.decrement(begin, part) ++ + left.increment(begin , part); ++ ++ bestCurrentCounts[0] = left.response(); ++ bestCurrentCounts[1] = right.response(); ++ ++ min_index_ = part - begin; ++ } ++ ++ template ++ double loss_of_region(DataSource_t const & labels, ++ Iter & begin, ++ Iter & end, ++ Array const & region_response) const ++ { ++ typedef typename ++ LossTraits::type LineSearchLoss; ++ LineSearchLoss region_loss(labels, ext_param_); ++ return ++ region_loss.init(begin, end, region_response); ++ } ++ ++}; ++ ++typedef ThresholdSplit MedianSplit; ++ ++ ++/** This Functor chooses a random value of a column ++ */ ++class RandomSplitOfColumn ++{ ++public: ++ ++ typedef GiniCriterion LineSearchLossTag; ++ ArrayVector class_weights_; ++ ArrayVector bestCurrentCounts[2]; ++ double min_gini_; ++ std::ptrdiff_t min_index_; ++ double min_threshold_; ++ ProblemSpec<> ext_param_; ++ typedef RandomMT19937 Random_t; ++ Random_t random; ++ ++ RandomSplitOfColumn() ++ {} ++ ++ template ++ RandomSplitOfColumn(ProblemSpec const & ext) ++ : ++ class_weights_(ext.class_weights_), ++ ext_param_(ext), ++ random(RandomSeed) ++ { ++ bestCurrentCounts[0].resize(ext.class_count_); ++ bestCurrentCounts[1].resize(ext.class_count_); ++ } ++ ++ template ++ RandomSplitOfColumn(ProblemSpec const & ext, Random_t & random_) ++ : ++ class_weights_(ext.class_weights_), ++ ext_param_(ext), ++ random(random_) ++ { ++ bestCurrentCounts[0].resize(ext.class_count_); ++ bestCurrentCounts[1].resize(ext.class_count_); ++ } ++ ++ template ++ void set_external_parameters(ProblemSpec const & ext) ++ { ++ class_weights_ = ext.class_weights_; ++ ext_param_ = ext; ++ bestCurrentCounts[0].resize(ext.class_count_); ++ bestCurrentCounts[1].resize(ext.class_count_); ++ } ++ ++ template< class DataSourceF_t, ++ class DataSource_t, ++ class I_Iter, ++ class Array> ++ void operator()(DataSourceF_t const & column, ++ DataSource_t const & labels, ++ I_Iter & begin, ++ I_Iter & end, ++ Array const & region_response) ++ { ++ std::sort(begin, end, ++ SortSamplesByDimensions(column, 0)); ++ typedef typename ++ LossTraits::type LineSearchLoss; ++ LineSearchLoss left(labels, ext_param_); ++ LineSearchLoss right(labels, ext_param_); ++ right.init(begin, end, region_response); ++ ++ ++ min_gini_ = NumericTraits::max(); ++ int tmp_pt = random.uniformInt(std::distance(begin, end)); ++ min_index_ = tmp_pt; ++ min_threshold_ = column[*(begin + min_index_)]; ++ SortSamplesByDimensions ++ sorter(column, 0, min_threshold_); ++ I_Iter part = std::partition(begin, end, sorter); ++ DimensionNotEqual comp(column, 0); ++ if(part == begin) ++ { ++ part= std::adjacent_find(part, end, comp)+1; ++ ++ } ++ if(part >= end) ++ { ++ return; ++ } ++ else ++ { ++ min_threshold_ = column[*part]; ++ } ++ min_gini_ = right.decrement(begin, part) ++ + left.increment(begin , part); ++ ++ bestCurrentCounts[0] = left.response(); ++ bestCurrentCounts[1] = right.response(); ++ ++ min_index_ = part - begin; ++ } ++ ++ template ++ double loss_of_region(DataSource_t const & labels, ++ Iter & begin, ++ Iter & end, ++ Array const & region_response) const ++ { ++ typedef typename ++ LossTraits::type LineSearchLoss; ++ LineSearchLoss region_loss(labels, ext_param_); ++ return ++ region_loss.init(begin, end, region_response); ++ } ++ ++}; ++ ++typedef ThresholdSplit RandomSplit; ++} ++} ++ ++ ++} //namespace vigra ++#endif // VIGRA_RANDOM_FOREST_SPLIT_HXX +diff -urNb vigra-Version-1-10-0/include/vigra/random_forest.hxx Vigra-src/include/vigra/random_forest.hxx +--- vigra-Version-1-10-0/include/vigra/random_forest.hxx 2013-11-18 17:48:16.000000000 +0100 ++++ Vigra-src/include/vigra/random_forest.hxx 2014-01-23 13:23:00.000000000 +0100 +@@ -584,6 +584,7 @@ + { + vigra_precondition(features.shape(0) == labels.shape(0), + "RandomForest::predictLabels(): Label array has wrong size."); ++#pragma omp parallel for + for(int k=0; k currentRow(rowVector(features, row));