diff --git a/cmake/ShogunUtils.cmake b/cmake/ShogunUtils.cmake index 7c94069528a..eca6cdad079 100644 --- a/cmake/ShogunUtils.cmake +++ b/cmake/ShogunUtils.cmake @@ -75,31 +75,31 @@ ENDMACRO() # based on compiz_discover_tests function (shogun_discover_tests EXECUTABLE) - add_dependencies (${EXECUTABLE} discover_gtest_tests) - - add_custom_command (TARGET ${EXECUTABLE} - POST_BUILD - COMMAND ${CMAKE_COMMAND} -D UNIT_TEST_CMD=${CMAKE_BINARY_DIR}/bin/${EXECUTABLE} - -D DISCOVER_CMD=${CMAKE_BINARY_DIR}/bin/discover_gtest_tests - -D WORKING_DIR=${CMAKE_CURRENT_BINARY_DIR} - -P ${CMAKE_MODULE_PATH}/discover_unit_tests.cmake - COMMENT "Discovering Tests in ${EXECUTABLE}" - DEPENDS - VERBATIM) + add_dependencies (${EXECUTABLE} discover_gtest_tests) + + add_custom_command (TARGET ${EXECUTABLE} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -D UNIT_TEST_CMD=${CMAKE_BINARY_DIR}/bin/${EXECUTABLE} + -D DISCOVER_CMD=${CMAKE_BINARY_DIR}/bin/discover_gtest_tests + -D WORKING_DIR=${CMAKE_CURRENT_BINARY_DIR} + -P ${CMAKE_MODULE_PATH}/discover_unit_tests.cmake + COMMENT "Discovering Tests in ${EXECUTABLE}" + DEPENDS + VERBATIM) endfunction () MACRO(AddMetaIntegrationTest META_TARGET CONDITION) - IF (${CONDITION}) - add_test(NAME integration_meta_${META_TARGET}-${NAME_WITH_DIR} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - COMMAND meta_example_integration_tester ${REL_DIR} ${NAME}.dat ${META_TARGET} generated_results reference_results) - set_tests_properties( - integration_meta_${META_TARGET}-${NAME_WITH_DIR} - PROPERTIES - LABELS "integration" - DEPENDS generated_${META_TARGET}-${NAME_WITH_DIR} - ) - ENDIF() + IF (${CONDITION}) + add_test(NAME integration_meta_${META_TARGET}-${NAME_WITH_DIR} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMAND meta_example_integration_tester ${REL_DIR} ${NAME}.dat ${META_TARGET} generated_results reference_results) + set_tests_properties( + integration_meta_${META_TARGET}-${NAME_WITH_DIR} + PROPERTIES + LABELS "integration" + DEPENDS 
generated_${META_TARGET}-${NAME_WITH_DIR} + ) + ENDIF() ENDMACRO() MACRO(AddLibShogunExample EXAMPLE_CPP) @@ -238,12 +238,12 @@ endfunction() function(SET_LINALG_BACKEND COMPONENT FLAG) OPTION(USE_EIGEN3_${FLAG} "Use ${COMPONENT} Eigen3" ON) CMAKE_DEPENDENT_OPTION( - USE_VIENNACL_${FLAG} "Use ${COMPONENT} ViennaCL" OFF - "VIENNACL_FOUND;USE_VIENNACL;NOT USE_EIGEN3_${FLAG}" ON) + USE_VIENNACL_${FLAG} "Use ${COMPONENT} ViennaCL" OFF + "VIENNACL_FOUND;USE_VIENNACL;NOT USE_EIGEN3_${FLAG}" ON) if(NOT ${COMPONENT}) - set(${COMPONENT} EIGEN3 CACHE STRING - "Set linear algebra backend ${COMPONENT}: EIGEN3, VIENNACL" - FORCE) + set(${COMPONENT} EIGEN3 CACHE STRING + "Set linear algebra backend ${COMPONENT}: EIGEN3, VIENNACL" + FORCE) endif() if (${COMPONENT} STREQUAL "EIGEN3") @@ -277,7 +277,7 @@ function(GET_META_EXAMPLE_VARS META_EXAMPLE EX_NAME REL_DIR NAME_WITH_DIR) endfunction() function(GET_INTERFACE_VARS INTERFACE DIRECTORY EXTENSION) - string(REGEX MATCH "INTERFACE_([a-zA-Z]+)" _dir ${INTERFACE}) + string(REGEX MATCH "INTERFACE_([a-zA-Z]+)" _dir ${INTERFACE}) STRING(TOLOWER "${CMAKE_MATCH_1}" _dir) SET(${DIRECTORY} ${_dir} PARENT_SCOPE) @@ -327,4 +327,4 @@ function(ADD_SHOGUN_BENCHMARK REL_BENCHMARK_NAME) if(ARGN) set_tests_properties(${BENCHMARK_NAME} PROPERTIES ${ARGN}) endif() -endfunction() +endfunction() \ No newline at end of file diff --git a/src/interfaces/swig/IO.i b/src/interfaces/swig/IO.i index ffc058f5e8a..647c48cb370 100644 --- a/src/interfaces/swig/IO.i +++ b/src/interfaces/swig/IO.i @@ -30,6 +30,9 @@ %shared_ptr(shogun::MemoryMappedFile) %shared_ptr(shogun::Compressor) +%shared_ptr(shogun::OpenMLFlow) +%shared_ptr(shogun::OpenMLRun) +%shared_ptr(shogun::OpenMLTask) #ifdef USE_BOOL %shared_ptr(shogun::StreamingFileFromSparseFeatures) %shared_ptr(shogun::StreamingFileFromDenseFeatures) @@ -183,6 +186,9 @@ namespace shogun %include %include +%include +%include +%include %include %include diff --git a/src/interfaces/swig/IO_includes.i 
b/src/interfaces/swig/IO_includes.i index d1d1fcbc8c8..8415b314423 100644 --- a/src/interfaces/swig/IO_includes.i +++ b/src/interfaces/swig/IO_includes.i @@ -13,6 +13,9 @@ #include #include +#include +#include +#include #include #include #include diff --git a/src/shogun/CMakeLists.txt b/src/shogun/CMakeLists.txt index 78a40e236a9..82491ab2fb2 100644 --- a/src/shogun/CMakeLists.txt +++ b/src/shogun/CMakeLists.txt @@ -1,6 +1,6 @@ INCLUDE_DIRECTORIES( - ${CMAKE_SOURCE_DIR}/src - ${CMAKE_BINARY_DIR}/src + ${CMAKE_SOURCE_DIR}/src + ${CMAKE_BINARY_DIR}/src ) include(GenerateExportHeader) @@ -46,23 +46,23 @@ set(SHOGUN_CLING_LIBRARY_DIR "\"${CMAKE_INSTALL_PREFIX}/${SHOGUN_LIB_INSTALL}\"" if (MSVC OR BUILD_BENCHMARKS) SET(LIBSHOGUN_BUILD_STATIC ON - CACHE BOOL "Build libshogun static library" FORCE) + CACHE BOOL "Build libshogun static library" FORCE) endif() # Allow to hide non-bsd compatible codes OPTION(LICENSE_GPL_SHOGUN "Include GPL codes of Shogun (non-BSD compatible) in build" ON) SET(USE_GPL_SHOGUN 0) IF (LICENSE_GPL_SHOGUN) - SET(SHOGUN_GPL_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/src/gpl) - IF(EXISTS "${SHOGUN_GPL_INCLUDE_DIR}/shogun") - SET(USE_GPL_SHOGUN 1) + SET(SHOGUN_GPL_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/src/gpl) + IF(EXISTS "${SHOGUN_GPL_INCLUDE_DIR}/shogun") + SET(USE_GPL_SHOGUN 1) - FILE(GLOB_RECURSE GPL_LIBSHOGUN_SRC ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_CPP} ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_C}) - FILE(GLOB_RECURSE GPL_LIBSHOGUN_HEADERS ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_HEADER}) + FILE(GLOB_RECURSE GPL_LIBSHOGUN_SRC ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_CPP} ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_C}) + FILE(GLOB_RECURSE GPL_LIBSHOGUN_HEADERS ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_HEADER}) - LIST(APPEND LIBSHOGUN_SRC ${GPL_LIBSHOGUN_SRC}) - LIST(APPEND LIBSHOGUN_HEADERS ${GPL_LIBSHOGUN_HEADERS}) - INCLUDE_DIRECTORIES(${SHOGUN_GPL_INCLUDE_DIR}) + LIST(APPEND LIBSHOGUN_SRC ${GPL_LIBSHOGUN_SRC}) + LIST(APPEND LIBSHOGUN_HEADERS ${GPL_LIBSHOGUN_HEADERS}) + 
INCLUDE_DIRECTORIES(${SHOGUN_GPL_INCLUDE_DIR}) SET(USE_GPL_SHOGUN 1) ELSE() @@ -72,7 +72,7 @@ ENDIF() OPTION(USE_SVMLIGHT "SVMLight" ON) IF(USE_SVMLIGHT AND NOT USE_GPL_SHOGUN) - MESSAGE(FATAL_ERROR "Can only use SVMLight when GPL codes are included. Disable by USE_SVMLIGHT=OFF") + MESSAGE(FATAL_ERROR "Can only use SVMLight when GPL codes are included. Disable by USE_SVMLIGHT=OFF") ENDIF() # add target to compile the libshogun sources @@ -84,13 +84,13 @@ ENDIF() # Generate versionstring.h ADD_CUSTOM_TARGET( - version - ${CMAKE_COMMAND} - -D SRC=${CMAKE_CURRENT_SOURCE_DIR}/lib/versionstring.h.in - -D DST=${CMAKE_CURRENT_BINARY_DIR}/lib/versionstring.h - -D ROOT_DIR=${CMAKE_SOURCE_DIR} - -P ${CMAKE_SOURCE_DIR}/cmake/version.cmake - COMMENT "Generating version header" + version + ${CMAKE_COMMAND} + -D SRC=${CMAKE_CURRENT_SOURCE_DIR}/lib/versionstring.h.in + -D DST=${CMAKE_CURRENT_BINARY_DIR}/lib/versionstring.h + -D ROOT_DIR=${CMAKE_SOURCE_DIR} + -P ${CMAKE_SOURCE_DIR}/cmake/version.cmake + COMMENT "Generating version header" ) add_dependencies(libshogun version) @@ -109,12 +109,12 @@ FOREACH(template ${LIBSHOGUN_SRC_TMP}) ENDFOREACH() ADD_CUSTOM_COMMAND(OUTPUT ${generated_cpp} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${template} ${generated_cpp} - COMMAND ${PYTHON_EXECUTABLE} ${generator_script} ${generated_cpp} -in ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt - DEPENDS ${template} ${generator_script} version ${LIBSHOGUN_HEADERS} ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - COMMENT "Generating ${generated_cpp}" - ) + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${template} ${generated_cpp} + COMMAND ${PYTHON_EXECUTABLE} ${generator_script} ${generated_cpp} -in ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt + DEPENDS ${template} ${generator_script} version ${LIBSHOGUN_HEADERS} ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Generating ${generated_cpp}" + ) 
ADD_CUSTOM_TARGET(${generated_target} DEPENDS ${generated_cpp}) add_dependencies(libshogun ${generated_target}) @@ -125,18 +125,18 @@ ENDFOREACH() add_library(shogun SHARED $ ${CMAKE_CURRENT_BINARY_DIR}/lib/config.h) generate_export_header(shogun) set_target_properties( - shogun PROPERTIES - VERSION ${LIBSHOGUNVER} - SOVERSION ${LIBSHOGUNSO} - INTERFACE_shogun_MAJOR_VERSION ${SHOGUN_VERSION_MAJOR}) + shogun PROPERTIES + VERSION ${LIBSHOGUNVER} + SOVERSION ${LIBSHOGUNSO} + INTERFACE_shogun_MAJOR_VERSION ${SHOGUN_VERSION_MAJOR}) set_property(TARGET shogun APPEND PROPERTY COMPATIBLE_INTERFACE_STRING ${SHOGUN_VERSION_MAJOR}) target_include_directories(shogun PUBLIC - $ - $ - $ -) + $ + $ + $ + ) IF (${USE_GPL_SHOGUN}) - target_include_directories(shogun PUBLIC $) + target_include_directories(shogun PUBLIC $) ENDIF() get_property(dirs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) @@ -149,11 +149,11 @@ endforeach() add_library(shogun_deps INTERFACE) target_link_libraries(shogun_deps INTERFACE shogun) target_include_directories(shogun_deps INTERFACE - $ - $ -) + $ + $ + ) IF (${USE_GPL_SHOGUN}) - target_include_directories(shogun_deps INTERFACE $) + target_include_directories(shogun_deps INTERFACE $) ENDIF() # add target for static library if enabled @@ -161,10 +161,10 @@ if (LIBSHOGUN_BUILD_STATIC) add_library(shogun-static STATIC $ ${CMAKE_CURRENT_BINARY_DIR}/lib/config.h) set_property(TARGET shogun-static PROPERTY OUTPUT_NAME shogun) target_include_directories(shogun-static PUBLIC - $ - $ - $ - ) + $ + $ + $ + ) IF (${USE_GPL_SHOGUN}) target_include_directories(shogun-static PUBLIC $) ENDIF() @@ -181,9 +181,9 @@ IF(MSVC) IF(MSVC_VERSION VERSION_LESS "1700") include(external/MSIntTypes) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC - $ - $ - ) + $ + $ + ) ENDIF() target_link_libraries(shogun PUBLIC winmm Shlwapi) @@ -301,49 +301,49 @@ endif() include(external/StanMath) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ -) + $ + $ + ) SHOGUN_INCLUDE_DIRS(SCOPE 
PUBLIC SYSTEM - $ - $ -) + $ + $ + ) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ -) + $ + $ + ) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ -) + $ + $ + ) ####### LINALG # ViennaCL detection SHOGUN_DEPENDENCIES( - LIBRARY ViennaCL - SCOPE PRIVATE - VERSION ${VIENNACL_VERSION_MINIMUM} - CONFIG_FLAG HAVE_VIENNACL) + LIBRARY ViennaCL + SCOPE PRIVATE + VERSION ${VIENNACL_VERSION_MINIMUM} + CONFIG_FLAG HAVE_VIENNACL) ####### /LINALG FIND_PACKAGE(rxcpp) IF(NOT rxcpp_FOUND) - include(external/rxcpp) - SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ - ) + include(external/rxcpp) + SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM + $ + $ + ) ELSE() - SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM ${rxcpp_INCLUDE_DIR}) + SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM ${rxcpp_INCLUDE_DIR}) ENDIF() # TFLogger package FIND_PACKAGE(TFLogger 0.1.0 CONFIG) IF (TFLogger_FOUND) - SET(HAVE_TFLOGGER 1) - SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE SYSTEM ${TFLogger_INCLUDE_DIR}) - target_link_libraries(shogun PRIVATE tflogger::tflogger) + SET(HAVE_TFLOGGER 1) + SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE SYSTEM ${TFLogger_INCLUDE_DIR}) + target_link_libraries(shogun PRIVATE tflogger::tflogger) ENDIF() #### LAPACK @@ -354,9 +354,9 @@ CHECK_CXX_SOURCE_COMPILES("#include \n int main(int argc, char** argv) IF (NOT HAVE_STD_VARIANT) include(external/variant) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ - ) + $ + $ + ) ENDIF() # spdlog @@ -376,9 +376,9 @@ ELSE() ENDIF() SHOGUN_DEPENDENCIES( - LIBRARY GLPK - SCOPE PRIVATE - CONFIG_FLAG USE_GLPK) + LIBRARY GLPK + SCOPE PRIVATE + CONFIG_FLAG USE_GLPK) SHOGUN_DEPENDENCIES( LIBRARY LibArchive @@ -391,19 +391,24 @@ SHOGUN_DEPENDENCIES( CONFIG_FLAG USE_CPLEX) SHOGUN_DEPENDENCIES( - LIBRARY ARPACK - SCOPE PRIVATE - CONFIG_FLAG HAVE_ARPACK) + LIBRARY CPLEX + SCOPE PRIVATE + CONFIG_FLAG USE_CPLEX) SHOGUN_DEPENDENCIES( - LIBRARY Mosek - SCOPE PRIVATE - CONFIG_FLAG USE_MOSEK) + LIBRARY ARPACK + SCOPE PRIVATE + CONFIG_FLAG HAVE_ARPACK) + +SHOGUN_DEPENDENCIES( + LIBRARY Mosek + SCOPE 
PRIVATE + CONFIG_FLAG USE_MOSEK) SHOGUN_DEPENDENCIES( - LIBRARY Protobuf - SCOPE PUBLIC - CONFIG_FLAG HAVE_PROTOBUF) + LIBRARY Protobuf + SCOPE PUBLIC + CONFIG_FLAG HAVE_PROTOBUF) IF (PROTOBUF_FOUND AND ENABLE_PROTOBUF) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/io/protobuf) @@ -414,12 +419,12 @@ IF (PROTOBUF_FOUND AND ENABLE_PROTOBUF) SET(name "${CMAKE_CURRENT_BINARY_DIR}/io/protobuf/${FIL_WE}") ADD_CUSTOM_COMMAND( - OUTPUT "${name}.pb.cc" "${name}.pb.h" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${FIL} - --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/io/protobuf - --proto_path ${CMAKE_CURRENT_SOURCE_DIR}/io/protobuf - COMMENT "Running C++ protocol buffer compiler on ${FIL_WE}.proto" - ) + OUTPUT "${name}.pb.cc" "${name}.pb.h" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${FIL} + --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/io/protobuf + --proto_path ${CMAKE_CURRENT_SOURCE_DIR}/io/protobuf + COMMENT "Running C++ protocol buffer compiler on ${FIL_WE}.proto" + ) ADD_CUSTOM_TARGET("${FIL_WE}Protobuf" DEPENDS "${name}.pb.cc") add_dependencies(libshogun "${FIL_WE}Protobuf") set_source_files_properties(${name}.pb.cc PROPERTIES GENERATED ON) @@ -441,40 +446,40 @@ if (NOT WIN32) # FIXME: HDF5 linking on WIN32 is broken. 
# at least with the hdf5 supplied in anaconda SHOGUN_DEPENDENCIES( - LIBRARY HDF5 - SCOPE PUBLIC - CONFIG_FLAG HAVE_HDF5) + LIBRARY HDF5 + SCOPE PUBLIC + CONFIG_FLAG HAVE_HDF5) endif () SHOGUN_DEPENDENCIES( - LIBRARY CURL - SCOPE PRIVATE - CONFIG_FLAG HAVE_CURL) + LIBRARY CURL + SCOPE PRIVATE + CONFIG_FLAG HAVE_CURL) SHOGUN_DEPENDENCIES( - LIBRARY ZLIB - SCOPE PRIVATE - CONFIG_FLAG USE_GZIP) + LIBRARY ZLIB + SCOPE PRIVATE + CONFIG_FLAG USE_GZIP) SHOGUN_DEPENDENCIES( - LIBRARY BZip2 - SCOPE PRIVATE - CONFIG_FLAG USE_BZIP2) + LIBRARY BZip2 + SCOPE PRIVATE + CONFIG_FLAG USE_BZIP2) SHOGUN_DEPENDENCIES( - LIBRARY LibLZMA - SCOPE PRIVATE - CONFIG_FLAG USE_LZMA) + LIBRARY LibLZMA + SCOPE PRIVATE + CONFIG_FLAG USE_LZMA) SHOGUN_DEPENDENCIES( - LIBRARY SNAPPY - SCOPE PRIVATE - CONFIG_FLAG USE_SNAPPY) + LIBRARY SNAPPY + SCOPE PRIVATE + CONFIG_FLAG USE_SNAPPY) SHOGUN_DEPENDENCIES( - LIBRARY LZO - SCOPE PRIVATE - CONFIG_FLAG USE_LZO) + LIBRARY LZO + SCOPE PRIVATE + CONFIG_FLAG USE_LZO) #integration OPTION(OpenCV "OpenCV Integration" OFF) @@ -502,25 +507,25 @@ IF(BUNDLE_NLOPT) SET(POSTLINKFLAGS ${POSTLINKFLAGS} ${NLOPT_LIBRARIES}) ELSE() SHOGUN_DEPENDENCIES( - LIBRARY NLopt - SCOPE PRIVATE - CONFIG_FLAG HAVE_NLOPT) + LIBRARY NLopt + SCOPE PRIVATE + CONFIG_FLAG HAVE_NLOPT) ENDIF() SHOGUN_DEPENDENCIES( - LIBRARY LpSolve - SCOPE PRIVATE - CONFIG_FLAG USE_LPSOLVE) + LIBRARY LpSolve + SCOPE PRIVATE + CONFIG_FLAG USE_LPSOLVE) SHOGUN_DEPENDENCIES( - LIBRARY ColPack - SCOPE PRIVATE - CONFIG_FLAG HAVE_COLPACK) + LIBRARY ColPack + SCOPE PRIVATE + CONFIG_FLAG HAVE_COLPACK) SHOGUN_DEPENDENCIES( - LIBRARY ARPREC - SCOPE PRIVATE - CONFIG_FLAG HAVE_ARPREC) + LIBRARY ARPREC + SCOPE PRIVATE + CONFIG_FLAG HAVE_ARPREC) ############################ HMM OPTION(USE_HMMDEBUG "HMM debug mode" OFF) @@ -547,7 +552,7 @@ OPTION(USE_LOGCACHE "Use (1+exp(x)) log cache (is much faster but less accurate) ################## linker optimisations OPTION(INCREMENTAL_LINKING "Enable incremantal linking") 
SET(INCREMENTAL_LINKING_DIR ${CMAKE_BINARY_DIR}/linker_cache - CACHE PATH "Incremental linking cache directory") + CACHE PATH "Incremental linking cache directory") OPTION(ENABLE_LTO "Enable link-time optimisation") include(LDGOLD) @@ -569,7 +574,7 @@ target_link_libraries(shogun PUBLIC ${POSTLINKFLAGS}) IF (BUILD_BENCHMARKS) IF (NOT BUILD_TYPE_UC STREQUAL "RELEASE") MESSAGE(WARNING "You are building benchmark against non-Release mode!" - "The results might not reflect the actual runtimes...") + "The results might not reflect the actual runtimes...") ENDIF() find_package(benchmark CONFIG REQUIRED) @@ -579,16 +584,16 @@ IF (BUILD_BENCHMARKS) target_link_libraries(shogun_benchmark_main benchmark::benchmark shogun-static) elseif(MSVC) target_link_libraries(shogun_benchmark_main - benchmark::benchmark - Shlwapi.lib - shogun-static - ) + benchmark::benchmark + Shlwapi.lib + shogun-static + ) else() target_link_libraries(shogun_benchmark_main - benchmark::benchmark - pthread - shogun-static - ) + benchmark::benchmark + pthread + shogun-static + ) endif() set(SHOGUN_BENCHMARK_LINK_LIBS shogun_benchmark_main) @@ -611,15 +616,15 @@ IF (LICENSE_GPL_SHOGUN) ENDIF() INSTALL( - DIRECTORY ${INCLUDE_HEADERS_DIR_LIST} - DESTINATION ${INCLUDE_INSTALL_DIR} - COMPONENT headers - FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" - PATTERN "CMakeFiles*" EXCLUDE - PATTERN "*.dir" EXCLUDE - PATTERN "*Debug*" EXCLUDE - PATTERN "*Release*" EXCLUDE - PATTERN ".settings" EXCLUDE) + DIRECTORY ${INCLUDE_HEADERS_DIR_LIST} + DESTINATION ${INCLUDE_INSTALL_DIR} + COMPONENT headers + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" + PATTERN "CMakeFiles*" EXCLUDE + PATTERN "*.dir" EXCLUDE + PATTERN "*Debug*" EXCLUDE + PATTERN "*Release*" EXCLUDE + PATTERN ".settings" EXCLUDE) # set the desidered targets to be installed set(INSTALL_TARGETS shogun) @@ -628,41 +633,41 @@ if (LIBSHOGUN_BUILD_STATIC) endif() INSTALL( - TARGETS ${INSTALL_TARGETS} - EXPORT ShogunTargets - ARCHIVE DESTINATION ${SHOGUN_LIB_INSTALL} 
- RUNTIME DESTINATION ${SHOGUN_LIB_INSTALL} - LIBRARY DESTINATION ${SHOGUN_LIB_INSTALL} - INCLUDES DESTINATION ${INCLUDE_INSTALL_DIR} + TARGETS ${INSTALL_TARGETS} + EXPORT ShogunTargets + ARCHIVE DESTINATION ${SHOGUN_LIB_INSTALL} + RUNTIME DESTINATION ${SHOGUN_LIB_INSTALL} + LIBRARY DESTINATION ${SHOGUN_LIB_INSTALL} + INCLUDES DESTINATION ${INCLUDE_INSTALL_DIR} ) file(TO_CMAKE_PATH ${SHOGUN_LIB_INSTALL}/cmake/shogun CONFIG_PACKAGE_DIR) configure_package_config_file( - ${CMAKE_SOURCE_DIR}/cmake/ShogunConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake - INSTALL_DESTINATION ${SHOGUN_LIB_INSTALL}/cmake/shogun - PATH_VARS INCLUDE_INSTALL_DIR CONFIG_PACKAGE_DIR) + ${CMAKE_SOURCE_DIR}/cmake/ShogunConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake + INSTALL_DESTINATION ${SHOGUN_LIB_INSTALL}/cmake/shogun + PATH_VARS INCLUDE_INSTALL_DIR CONFIG_PACKAGE_DIR) write_basic_package_version_file( - "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" - VERSION ${VERSION} - COMPATIBILITY SameMajorVersion + "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" + VERSION ${VERSION} + COMPATIBILITY SameMajorVersion ) export(EXPORT ShogunTargets - FILE "${CMAKE_CURRENT_BINARY_DIR}/ShogunTargets.cmake" - NAMESPACE shogun:: -) + FILE "${CMAKE_CURRENT_BINARY_DIR}/ShogunTargets.cmake" + NAMESPACE shogun:: + ) install( - EXPORT ShogunTargets - FILE ShogunTargets.cmake - NAMESPACE shogun:: - DESTINATION ${CONFIG_PACKAGE_DIR} + EXPORT ShogunTargets + FILE ShogunTargets.cmake + NAMESPACE shogun:: + DESTINATION ${CONFIG_PACKAGE_DIR} ) install( - FILES - "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake" - "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" - DESTINATION ${CONFIG_PACKAGE_DIR} -) + FILES + "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" + DESTINATION ${CONFIG_PACKAGE_DIR} +) \ No newline at end of file diff --git a/src/shogun/features/CombinedFeatures.cpp 
b/src/shogun/features/CombinedFeatures.cpp index 5fedc885f70..9cf72f5748b 100644 --- a/src/shogun/features/CombinedFeatures.cpp +++ b/src/shogun/features/CombinedFeatures.cpp @@ -161,7 +161,7 @@ int32_t CombinedFeatures::get_num_feature_obj() const void CombinedFeatures::init() { SG_ADD(&num_vec, "num_vec", "Number of vectors."); - SG_ADD(&feature_array, "feature_array", "Feature array."); + watch_param("feature_array", &feature_array); } std::shared_ptr CombinedFeatures::create_merged_copy(std::shared_ptr other) const diff --git a/src/shogun/io/openml/OpenMLData.cpp b/src/shogun/io/openml/OpenMLData.cpp new file mode 100644 index 00000000000..4906c6db345 --- /dev/null +++ b/src/shogun/io/openml/OpenMLData.cpp @@ -0,0 +1,271 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include +#include +#include + +#include +#include +#include + +using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; + +std::shared_ptr +OpenMLData::get_dataset(const std::string& id, const std::string& api_key) +{ + // description + auto reader = OpenMLFile(api_key); + auto return_string = reader.get("dataset_description", "json", id); + + auto& dataset_description = check_response( + return_string, "data_set_description"); + + auto name = return_if_possible( + "name", dataset_description.GetObject()); + auto description = return_if_possible( + "description", dataset_description.GetObject()); + auto data_format = return_if_possible( + "data_format", dataset_description.GetObject()); + auto dataset_id = + return_if_possible("id", dataset_description.GetObject()); + auto version = return_if_possible( + "version", dataset_description.GetObject()); + auto creator = return_if_possible( + "creator", dataset_description.GetObject()); + auto contributor = return_if_possible( + "contributor", dataset_description.GetObject()); + auto collection_date = return_if_possible( + "collection_date", 
dataset_description.GetObject()); + auto upload_date = return_if_possible( + "upload_date", dataset_description.GetObject()); + auto language = return_if_possible( + "language", dataset_description.GetObject()); + auto licence = return_if_possible( + "licence", dataset_description.GetObject()); + auto url = + return_if_possible("url", dataset_description.GetObject()); + auto default_target_attribute = return_if_possible( + "default_target_attribute", dataset_description.GetObject()); + auto row_id_attribute = return_if_possible( + "row_id_attribute", dataset_description.GetObject()); + auto ignore_attribute = return_if_possible( + "ignore_attribute", dataset_description.GetObject()); + auto version_label = return_if_possible( + "version_label", dataset_description.GetObject()); + auto citation = return_if_possible( + "citation", dataset_description.GetObject()); + auto tags = return_if_possible>( + "tag", dataset_description.GetObject()); + auto visibility = return_if_possible( + "visibility", dataset_description.GetObject()); + auto original_data_url = return_if_possible( + "original_data_url", dataset_description.GetObject()); + auto paper_url = return_if_possible( + "paper_url", dataset_description.GetObject()); + auto update_comment = return_if_possible( + "update_comment", dataset_description.GetObject()); + auto md5_checksum = return_if_possible( + "md5_checksum", dataset_description.GetObject()); + + // features + std::vector>> + param_vector; + return_string = reader.get("data_features", "json", id); + + auto& dataset_features = + check_response(return_string, "data_features"); + + for (const auto& param : dataset_features["feature"].GetArray()) + { + std::unordered_map> param_map; + for (const auto& param_descriptors : param.GetObject()) + { + std::vector second; + if (param_descriptors.value.IsArray()) + for (const auto& v : param_descriptors.value.GetArray()) + second.emplace_back(v.GetString()); + else + 
second.emplace_back(param_descriptors.value.GetString()); + + param_map.emplace(param_descriptors.name.GetString(), second); + } + param_vector.push_back(param_map); + } + + // qualities + std::vector> qualities_vector; + return_string = reader.get("data_qualities", "json", id); + + auto& data_qualities = + check_response(return_string, "data_qualities"); + + for (const auto& param : data_qualities["quality"].GetArray()) + { + std::unordered_map param_map; + for (const auto& param_quality : param.GetObject()) + { + if (param_quality.name.IsString() && param_quality.value.IsString()) + param_map.emplace( + param_quality.name.GetString(), + param_quality.value.GetString()); + else if (param_quality.name.IsString()) + param_map.emplace(param_quality.name.GetString(), ""); + } + qualities_vector.push_back(param_map); + } + + auto result = std::make_shared( + name, description, data_format, dataset_id, version, creator, + contributor, collection_date, upload_date, language, licence, url, + default_target_attribute, row_id_attribute, ignore_attribute, + version_label, citation, tags, visibility, original_data_url, paper_url, + update_comment, md5_checksum, param_vector, qualities_vector); + result->set_api_key(api_key); + return result; +} + +std::shared_ptr OpenMLData::get_features() noexcept +{ + // if (!m_cached_features) + // get_data(); + // return m_cached_features; + SG_SNOTIMPLEMENTED + return nullptr; +} + +std::shared_ptr OpenMLData::get_features(const std::string& label) +{ + if (m_cached_features.empty()) + get_data(); + auto find_label = + std::find(m_feature_names.begin(), m_feature_names.end(), label); + if (find_label == m_feature_names.end()) + SG_SERROR("Requested label \"%s\" not in the dataset!\n", label.c_str()) + auto col_idx = std::distance(m_feature_names.begin(), find_label); + auto feat_type_copy = m_feature_types; + feat_type_copy.erase(feat_type_copy.begin() + col_idx); + for (const auto type : feat_type_copy) + { + if (type == 
Attribute::STRING) + SG_SNOTIMPLEMENTED + } + + std::shared_ptr> result; + bool first = true; + size_t n_examples = 0; + for (int i = 0; i < m_feature_types.size(); ++i) + { + if (i != col_idx && first) + { + result.reset(m_cached_features[0]->as>()); + n_examples = result->get_num_vectors(); + first = false; + } + if (i != col_idx) + { + REQUIRE( + n_examples == m_cached_features[i]->get_num_vectors(), + "Expected all features to have the same number of examples!\n") + result.reset(result->create_merged_copy(m_cached_features[i].get()) + ->as>()); + } + } + + REQUIRE(n_examples != 0, "No features extracted!\n") + + // need to copy data as result is only in the stack and the data + // will be gone at the end of the function + auto* copy_feat = SG_MALLOC(float64_t, m_feature_types.size() * n_examples); + memcpy( + copy_feat, result->get_feature_matrix().data(), + m_feature_types.size() * m_cached_features.size()); + + return std::make_shared>( + copy_feat, m_feature_types.size(), n_examples); +} + +std::shared_ptr OpenMLData::get_labels() +{ + REQUIRE( + !m_default_target_attribute.empty(), + "A default target attribute is required if no label is given!\n") + return get_labels(m_default_target_attribute); +} + +std::shared_ptr OpenMLData::get_labels(const std::string& label_name) +{ + if (m_cached_labels && label_name == m_cached_label_name) + return m_cached_labels; + + if (m_cached_features.empty()) + get_data(); + + auto find_label = + std::find(m_feature_names.begin(), m_feature_names.end(), label_name); + if (find_label == m_feature_names.end()) + SG_SERROR( + "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) + auto col_idx = std::distance(m_feature_names.begin(), find_label); + + std::shared_ptr target_label_as_feat = + m_cached_features[col_idx]; + + switch (m_feature_types[col_idx]) + { + // real features + case Attribute::REAL: + case Attribute::NUMERIC: + case Attribute::INTEGER: + case Attribute::DATE: + { + auto casted_feat = 
std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + auto labels = std::make_shared(labels_vec); + m_cached_labels = labels; + m_cached_label_name = label_name; + return m_cached_labels; + } + break; + // nominal features + case Attribute::NOMINAL: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + for (auto& val : labels_vec) + { + if (val == 0) + val = -1; + } + auto labels = std::make_shared(labels_vec); + m_cached_labels = labels; + m_cached_label_name = label_name; + return m_cached_labels; + } + break; + default: + SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) + } + + return nullptr; +} + +void OpenMLData::get_data() +{ + auto reader = OpenMLFile(m_api_key); + std::shared_ptr ss = + std::make_shared(reader.get(m_url)); + + auto parser = ARFFDeserializer(ss); + parser.read(); + m_cached_features = parser.get_features(); + m_feature_names = parser.get_feature_names(); + m_feature_types = parser.get_attribute_types(); +} diff --git a/src/shogun/io/openml/OpenMLData.h b/src/shogun/io/openml/OpenMLData.h new file mode 100644 index 00000000000..022eb06ad8d --- /dev/null +++ b/src/shogun/io/openml/OpenMLData.h @@ -0,0 +1,162 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLDATA_H +#define SHOGUN_OPENMLDATA_H + +#include +#include + +#include +#include +#include + +namespace shogun +{ + /** + * Handles an OpenML dataset. 
+ */ + class OpenMLData + { + public: + OpenMLData( + const std::string& name, const std::string& description, + const std::string& data_format, const std::string& dataset_id, + const std::string& version, const std::string& creator, + const std::string& contributor, const std::string& collection_date, + const std::string& upload_date, const std::string& language, + const std::string& license, const std::string& url, + const std::string& default_target_attribute, + const std::string& row_id_attribute, + const std::string& ignore_attribute, + const std::string& version_label, const std::string& citation, + std::vector tag, const std::string& visibility, + const std::string& original_data_url, const std::string& paper_url, + const std::string& update_comment, const std::string& md5_checksum, + std::vector< + std::unordered_map> + + > + param_descriptors, + std::vector> + param_qualities) + : + + m_name(name), m_description(description), + m_data_format(data_format), m_dataset_id(dataset_id), + m_version(version), m_creator(creator), + m_contributor(contributor), m_collection_date(collection_date), + m_upload_date(upload_date), m_language(language), + m_license(license), m_url(url), + m_default_target_attribute(default_target_attribute), + m_row_id_attribute(row_id_attribute), + m_ignore_attribute(ignore_attribute), + m_version_label(version_label), m_citation(citation), + m_tag(std::move(tag)), m_visibility(visibility), + m_original_data_url(original_data_url), m_paper_url(paper_url), + m_update_comment(update_comment), m_md5_checksum(md5_checksum), + m_param_descriptors(std::move(param_descriptors)), + m_param_qualities(std::move(param_qualities)) + { + } + + /** + * Creates a dataset instance from a given ID. 
+ * + */ + static std::shared_ptr + get_dataset(const std::string& id, const std::string& api_key); + + /** + * Returns ALL the features of the dataset, potentially also the labels + * column + * @return the features + */ + std::shared_ptr get_features() noexcept; + + /** + * Returns the dataset features + * @param label_name the name of the attribute containing the label + * @return the features + */ + std::shared_ptr get_features(const std::string& label_name); + + /** + * Returns the dataset labels if m_default_target_attribute is not empty + * @return the labels + */ + std::shared_ptr get_labels(); + + /** + * Returns the dataset labels given the label_name + * @return the labels + */ + std::shared_ptr get_labels(const std::string& label_name); + + /** + * Returns the type of all attributes/features in the ARFF file + * @return + */ + SG_FORCED_INLINE std::vector get_feature_types() const + noexcept + { + return m_feature_types; + } + + SG_FORCED_INLINE std::string get_default_target_attribute() const + noexcept + { + return m_default_target_attribute; + } + + protected: + SG_FORCED_INLINE void set_api_key(const std::string& api_key) noexcept + { + m_api_key = api_key; + } + + private: + void get_data(); + + std::string m_name; + std::string m_description; + std::string m_data_format; + std::string m_dataset_id; + std::string m_version; + std::string m_creator; + std::string m_contributor; + std::string m_collection_date; + std::string m_upload_date; + std::string m_language; + std::string m_license; + std::string m_url; + std::string m_default_target_attribute; + std::string m_row_id_attribute; + std::string m_ignore_attribute; + std::string m_version_label; + std::string m_citation; + std::vector m_tag; + std::string m_visibility; + std::string m_original_data_url; + std::string m_paper_url; + std::string m_update_comment; + std::string m_md5_checksum; + std::vector>> + m_param_descriptors; + std::vector> + m_param_qualities; + std::string m_api_key; + + 
std::vector> m_cached_features; + std::vector m_feature_names; + std::vector m_feature_types; + std::shared_ptr m_cached_labels; + std::string m_cached_label_name; + }; + +} // namespace shogun + +#endif // SHOGUN_OPENMLDATA_H diff --git a/src/shogun/io/openml/OpenMLFile.cpp b/src/shogun/io/openml/OpenMLFile.cpp new file mode 100644 index 00000000000..3e20e4b264f --- /dev/null +++ b/src/shogun/io/openml/OpenMLFile.cpp @@ -0,0 +1,117 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include + +// the class declaration is needed with or without libcurl: the static +// members below and the non-curl branch of encode_string are always compiled +#include "OpenMLFile.h" + +#ifdef HAVE_CURL +#include + +#endif // HAVE_CURL + +using namespace shogun; + +/** + * The writer callback function used to write the packets to a C++ string. + * @param data the data received in CURL request + * @param size always 1 + * @param nmemb the size of data + * @param buffer_in the buffer to write to + * @return the size of buffer that was written + */ +size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) +{ + // check that the buffer string points to something + if (buffer_in != nullptr) + { + // Append the data to the buffer + buffer_in->append(data, size * nmemb); + + return size * nmemb; + } + return 0; +} + +/* OpenML server format */ +const char* OpenMLFile::xml_server = "https://www.openml.org/api/v1/xml"; +const char* OpenMLFile::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLFile::download_server = ""; +const char* OpenMLFile::splits_server = "https://www.openml.org/api_splits"; + +/* DATA API */ +const char* OpenMLFile::dataset_description = "/data/{}"; +const char* OpenMLFile::list_data_qualities = "/data/qualities/list"; +const char* OpenMLFile::data_features = "/data/features/{}"; +const char* OpenMLFile::data_qualities = "/data/qualities/{}"; +const char* OpenMLFile::list_dataset_qualities = "/data/qualities/{}"; +const char* OpenMLFile::list_dataset_filter = "/data/list/{}"; +/* FLOW API */ +const char* 
OpenMLFile::flow_file = "/flow/{}"; +const char* OpenMLFile::flow_exists = "/flow/exists/{}"; +/* TASK API */ +const char* OpenMLFile::task_file = "/task/{}"; +/* SPLIT API */ +const char* OpenMLFile::get_split = ""; + +const std::unordered_map + OpenMLFile::m_format_options = {{"xml", xml_server}, + {"json", json_server}, + {"split", splits_server}, + {"download", download_server}}; +const std::unordered_map + OpenMLFile::m_request_options = { + {"dataset_description", dataset_description}, + {"list_data_qualities", list_data_qualities}, + {"data_features", data_features}, + {"data_qualities", data_qualities}, + {"list_dataset_qualities", list_dataset_qualities}, + {"list_dataset_filter", list_dataset_filter}, + {"flow_file", flow_file}, + {"flow_exists", flow_exists}, + {"task_file", task_file}}; + +void OpenMLFile::openml_curl_request_helper(const std::string& url) +{ +#ifdef HAVE_CURL + auto curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.\n") + } + + curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl_handle, CURLOPT_HTTPGET, 1); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); + + CURLcode res = curl_easy_perform(curl_handle); + + if (res != CURLE_OK) + SG_SERROR("Connection error: %s.\n", curl_easy_strerror(res)) + + curl_easy_cleanup(curl_handle); +#endif // HAVE_CURL +} + +std::string OpenMLFile::encode_string(const std::string& s) +{ +#ifdef HAVE_CURL + auto curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.\n") + } + + char* encoded_url = curl_easy_escape(curl_handle, s.c_str(), s.size()); + if (!encoded_url) + SG_SERROR("Failed to encode \"%s\" URL escaped.\n", s.c_str()) + return encoded_url; +#else + return s; +#endif +} diff --git a/src/shogun/io/openml/OpenMLFile.h b/src/shogun/io/openml/OpenMLFile.h new file mode 100644 
index 00000000000..ef1796b09ff --- /dev/null +++ b/src/shogun/io/openml/OpenMLFile.h @@ -0,0 +1,190 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLREADER_H +#define SHOGUN_OPENMLREADER_H + +#include +#include + +#include +#include +#include +#include + +namespace shogun +{ + /** + * Reads OpenML streams which can be downloaded with this function. + */ + class OpenMLFile + { + + public: + explicit OpenMLFile(const std::string& api_key) : m_api_key(api_key) + { + } + + /** + * Returns a string returned by the server given a request. + * Raises an error if the returned code is not 200. + * Additional arguments can be passed to the request, + * which are then concatenated with a "/" character. + * + * @tparam Args argument type pack, should all be std::string + * @param request the request name, see m_request_options + * @param format the format to return the data in, see m_format_options + * @param args the additional arguments to be passed to request + * @return the returned stream from the server if the return code is 200 + */ + template + std::string + get(const std::string& request, const std::string& format, Args... 
args) + { + std::string request_path; + auto find_format = m_format_options.find(format); + if (find_format == m_format_options.end()) + { + SG_SERROR( + "The provided format \"%s\" is not available\n", + format.c_str()) + } + + if (format == "split") + { + REQUIRE( + request == "get_split", + "Split server can only handle \"get_split\" request.\n") + request_path = get_split; + } + else + { + auto find_request = m_request_options.find(request); + if (find_request == m_request_options.end()) + { + SG_SERROR( + "Could not find a way to solve the request \"%s\"\n", + request.c_str()) + } + request_path = find_request->second; + } + + std::string request_format = find_format->second; + + // get additional args and concatenate them with "/" + if (sizeof...(Args) > 0) + { + if (request_path.substr(request_path.size() - 2) == "{}") + { + request_path = + request_path.substr(0, request_path.size() - 2); + } + else + { + SG_SERROR( + "The provided request \"%s\" cannot handle additional " + "args.\n", + request.c_str()) + } + std::vector args_vec = {args...}; + std::string args_string = std::accumulate( + args_vec.begin() + 1, args_vec.end(), args_vec.front(), + [](std::string s0, std::string& s1) { + return s0 += "/" + encode_string(s1); + }); + request_path += args_string; + } + + std::string url = request_format + request_path + "?" + m_api_key; + + return get(url); + } + + std::string get(const std::string& url) + { +#ifdef HAVE_CURL + // clear the buffer before request + m_curl_response_buffer.clear(); + + openml_curl_request_helper(url); + return m_curl_response_buffer; +#else + SG_SERROR( + "Please compile shogun with libcurl to query the OpenML server!\n") +#endif // HAVE_CURL + } + + private: + + static std::string encode_string(const std::string& s); + + /** the raw buffer as a C++ string */ + std::string m_curl_response_buffer; + + /** + * Initialises CURL session and gets the data. + * This function also handles the response code from the server. 
+ * + * @param url the url to query + */ + void openml_curl_request_helper(const std::string& url); + + /** the user API key, not required for all requests */ + std::string m_api_key; + + /** the server path to get a response in XML format*/ + static const char* xml_server; + /** the server path to get a response in JSON format*/ + static const char* json_server; + /** the server path to download datasets */ + static const char* download_server; + /** the server path to get a split in ARFF format */ + static const char* splits_server; + + /** the server response format options: XML or JSON */ + static const std::unordered_map + m_format_options; + /** all the supported server options */ + static const std::unordered_map + m_request_options; + + /* DATA API */ + static const char* dataset_description; + static const char* list_data_qualities; + static const char* data_features; + static const char* data_qualities; + static const char* list_dataset_qualities; + static const char* list_dataset_filter; + + /* FLOW API */ + static const char* flow_file; + static const char* flow_exists; + + /* TASK API */ + static const char* task_file; + + /* SPLIT API */ + static const char* get_split; + }; + + /** + * Writes OpenML streams to the OpenML server. + */ + class OpenMLWritter + { + public: + OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; + + template + bool post(const std::string& request, const std::string& format, const std::string& message, Args... args); + + private: + /** the user API key, likely to be needed to write to OpenML */ + std::string m_api_key; + }; +} // namespace shogun + +#endif // SHOGUN_OPENMLREADER_H diff --git a/src/shogun/io/openml/OpenMLFlow.cpp b/src/shogun/io/openml/OpenMLFlow.cpp new file mode 100644 index 00000000000..e36ecd251e3 --- /dev/null +++ b/src/shogun/io/openml/OpenMLFlow.cpp @@ -0,0 +1,241 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). 
+ * + * Authors: Gil Hoben + */ + +#include +#include + +#include +#include +#include + +#include + +using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; + +/** + * + */ +class ShogunToString : public AnyVisitor +{ +public: + explicit ShogunToString(){SG_SDEBUG("Debugging ShogunToString\n")}; + + void on(bool* v) final + { + m_string_val = (*v ? "true" : "false"); + } + void on(int32_t* v) final + { + m_string_val = std::to_string(*v); + } + void on(int64_t* v) final + { + m_string_val = std::to_string(*v); + } + void on(float* v) final + { + m_string_val = std::to_string(*v); + } + void on(double* v) final + { + m_string_val = std::to_string(*v); + } + void on(long double* v) + { + m_string_val = std::to_string(*v); + } + void on(CSGObject** v) final + { + m_string_val = ""; + } + void on(SGVector* v) final + { + m_string_val = v->to_string(); + } + void on(SGVector* v) final + { + m_string_val = v->to_string(); + } + void on(SGVector* v) final + { + m_string_val = v->to_string(); + } + void on(SGMatrix* mat) final + { + m_string_val = mat->to_string(); + } + void on(SGMatrix* mat) final + { + m_string_val = mat->to_string(); + } + void on(SGMatrix* mat) final + { + m_string_val = mat->to_string(); + } + + SG_FORCED_INLINE std::string get_string_value() const noexcept + { + return m_string_val; + } + +private: + std::string m_string_val; +}; + +std::shared_ptr OpenMLFlow::download_flow( + const std::string& flow_id, const std::string& api_key) +{ + parameters_type params; + components_type components; + + // get flow and parse with RapidJSON + auto reader = OpenMLFile(api_key); + auto return_string = reader.get("flow_file", "json", flow_id); + + auto& root = check_response(return_string, "flow"); + + std::string name = + return_if_possible("name", root.GetObject()); + std::string description = + return_if_possible("description", root.GetObject()); + std::string class_name = + return_if_possible("class_name", 
root.GetObject()); + std::string external_version = + return_if_possible("external_version", root.GetObject()); + + REQUIRE( + root["id"].GetString() == flow_id, + "The flow id returned by the server does not match the id provided. " + "Got %s instead of %s.\n", + root["id"].GetString(), flow_id.c_str()) + + // handle parameters + if (root.HasMember("parameter")) + { + std::unordered_map param_dict; + OpenMLFlowParameter params_i{}; + + if (root["parameter"].IsArray()) + { + for (const auto& v : root["parameter"].GetArray()) + { + add_string_to_struct(v, "name", params_i.name); + add_string_to_struct(v, "data_type", params_i.data_type); + add_string_to_struct( + v, "default_value", params_i.default_value); + add_string_to_struct(v, "description", params_i.description); + params.emplace(params_i.name, params_i); + } + } + else + { + // parameter can also be a dict, instead of array + const auto& v = root["parameter"].GetObject(); + add_string_to_struct(v, "name", params_i.name); + add_string_to_struct(v, "data_type", params_i.data_type); + add_string_to_struct(v, "default_value", params_i.default_value); + add_string_to_struct(v, "description", params_i.description); + params.emplace(params_i.name, params_i); + } + } + + // handle components, i.e. 
kernels + if (root.HasMember("component")) + { + if (root["component"].IsArray()) + { + for (const auto& v : root["component"].GetArray()) + { + components.emplace( + must_return("identifier", v), + OpenMLFlow::download_flow( + v["flow"]["id"].GetString(), api_key)); + } + } + else + { + components.emplace( + must_return("identifier", root["component"]), + OpenMLFlow::download_flow( + root["component"]["flow"]["id"].GetString(), api_key)); + } + } + + auto flow = std::make_shared( + flow_id, name, description, class_name, external_version, components, + params); + + return flow; +} + +void OpenMLFlow::upload_flow(const std::shared_ptr& flow) +{ + SG_SNOTIMPLEMENTED; +} + +void OpenMLFlow::dump() const +{ + SG_SNOTIMPLEMENTED; +} + +std::shared_ptr OpenMLFlow::from_file() +{ + SG_SNOTIMPLEMENTED; + return std::shared_ptr(); +} + +bool OpenMLFlow::exists_on_server() +{ + // check if flow with same name and version exists + auto reader = std::make_unique(""); + auto flow_exists_string = + reader->get("flow_exists", "json", m_name, m_external_version); + + auto& root = + check_response(flow_exists_string, "flow_exists"); + + return strcmp(root["exists"].GetString(), "true") == 0; +} + +std::vector> +OpenMLFlow::obtain_parameter_values(const std::shared_ptr& model) +{ + std::vector> result; + auto obj_param = model->get_params(); + auto visitor = std::make_unique(); + + result.reserve(m_parameters.size()); + + for (const auto& param : m_parameters) + { + Any any_val = obj_param.at(param.first)->get_value(); + any_val.visit(visitor.get()); + // nested objects are handled below + if (!visitor->get_string_value().empty()) + { + // result.emplace_back to call OpenMLParameterValues constructor + // doesn't work here, so create a temporary value with make_shared + // and then push_back + auto val = std::make_shared( + param.first, m_flow_id, visitor->get_string_value()); + result.push_back(val); + } + } + + for (const auto& components : m_components) + { + // TODO: remove 
std::shared_ptr when smart pointers available + auto obj = std::shared_ptr(model->get(components.first)); + auto val = std::make_shared( + components.first, m_flow_id, + components.second->obtain_parameter_values(obj)); + result.push_back(val); + } + + return result; +} diff --git a/src/shogun/io/openml/OpenMLFlow.h b/src/shogun/io/openml/OpenMLFlow.h new file mode 100644 index 00000000000..7ff9f4ff5e0 --- /dev/null +++ b/src/shogun/io/openml/OpenMLFlow.h @@ -0,0 +1,207 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLFLOW_H +#define SHOGUN_OPENMLFLOW_H + +#include +#include + +#include +#include + +namespace shogun +{ + /** + * OpenML representation of a parameter + */ + struct OpenMLParameterValues + { + /** name of parameter */ + std::string name; + /** id of flow parameter belongs to */ + std::string component; + /** value of parameter or a nested object */ + shogun::variant< + std::string, std::vector>> + value; + + OpenMLParameterValues( + const std::string& name_, const std::string& component_, + shogun::variant< + std::string, + std::vector>> + value_) + : name(name_), component(component_), value(std::move(value_)) + { + } + }; + + /** + * OpenML description of a flow parameter + */ + struct OpenMLFlowParameter + { + /** name of parameter */ + std::string name; + /** primitive type of parameter */ + std::string data_type; + /** the default value, i.e. the value published to OpenML */ + std::string default_value; + /** the parameter description */ + std::string description; + }; + /** + * Handles OpenML flows. A flow contains the information + * required to instantiate a model. 
+ */ + class OpenMLFlow + { + + public: + /** alias for component type, map of flows */ + using components_type = + std::unordered_map>; + /** alias for parameter type, map of maps with information specific to a + * parameter */ + using parameters_type = std::unordered_map; + + /** + * The OpenMLFlow constructor. This constructor is rarely used by the + * user and is used by the static class members download_flow and + * from_file. The user is expected to use either of the previously + * mentioned functions. + * + * @param name the model name + * @param description the model description + * @param model the flow class_name field + * @param components a map of subflows, i.e. kernels + * @param parameters a map of parameter information, i.e. default values + * for each parameter name + */ + OpenMLFlow( + const std::string& flow_id, const std::string& name, + const std::string& description, const std::string& model, + const std::string& external_version, components_type components, + parameters_type parameters) + : m_flow_id(flow_id), m_name(name), m_description(description), + m_class_name(model), m_external_version(external_version), + m_parameters(std::move(parameters)), + m_components(std::move(components)) + { + } + + /** + * Instantiates a OpenMLFlow by downloaded a flow from the OpenML + * server. + * + * @param flow_id the flow ID + * @param api_key the user API key (might not be required and can be an + * empty string) + * @return the OpenMLFlow corresponding to the flow requested + * @throws ShogunException when there is a server error or the requested + * flow is ill formed. + */ + static std::shared_ptr + download_flow(const std::string& flow_id, const std::string& api_key); + + /** + * Instantiates a OpenMLFlow from a file. 
+ * @return the OpenMLFlow corresponding to the flow requested + */ + static std::shared_ptr from_file(); + + /** + * Publishes a flow to the OpenML server + * @param flow the flow to be published + */ + static void upload_flow(const std::shared_ptr& flow); + + /** + * Dumps the OpenMLFlow to disk. + */ + void dump() const; + + bool exists_on_server(); + + std::vector> + obtain_parameter_values(const std::shared_ptr& model); + + /** + * Gets a subflow, i.e. a kernel in a machine + * @param name the name of the subflow, not the flow ID + * @return the subflow if it exists + */ + std::shared_ptr get_subflow(const std::string& name) + { + auto find_flow = m_components.find(name); + if (find_flow != m_components.end()) + return find_flow->second; + else + SG_SERROR( + "The provided subflow could not be found in this flow!") + return nullptr; + } + +#ifndef SWIG + SG_FORCED_INLINE parameters_type get_parameters() const noexcept + { + return m_parameters; + } + + SG_FORCED_INLINE components_type get_components() const noexcept + { + return m_components; + } + + SG_FORCED_INLINE std::string get_class_name() const noexcept + { + return m_class_name; + } + + SG_FORCED_INLINE std::string get_version() const noexcept + { + return m_external_version; + } + + SG_FORCED_INLINE std::string get_name() const noexcept + { + return m_name; + } + + SG_FORCED_INLINE std::string get_flow_id() const noexcept + { + return m_flow_id; + } + + SG_FORCED_INLINE void set_model(std::shared_ptr model) noexcept + { + m_model = model; + } + +#endif // SWIG + + private: + /** flow unique id */ + std::string m_flow_id; + /** name field of the flow */ + std::string m_name; + /** description field of the flow */ + std::string m_description; + /** the class_name field of the flow */ + std::string m_class_name; + /** the external version, e.g. 
"shogun=7.0.0" */ + std::string m_external_version; + /** the parameter field of the flow (optional) */ + parameters_type m_parameters; + /** the components fields of the flow (optional) */ + components_type m_components; + /** the associated model (optional) */ + std::shared_ptr m_model; + }; +} // namespace shogun + +#endif // SHOGUN_OPENMLFLOW_H diff --git a/src/shogun/io/openml/OpenMLRun.cpp b/src/shogun/io/openml/OpenMLRun.cpp new file mode 100644 index 00000000000..8cd761d6015 --- /dev/null +++ b/src/shogun/io/openml/OpenMLRun.cpp @@ -0,0 +1,126 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include +#include +#include +#include +#include +#include + +using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; + +std::shared_ptr OpenMLRun::run_model_on_task( + std::shared_ptr model, std::shared_ptr task) +{ + SG_SNOTIMPLEMENTED + return std::shared_ptr(); +} + +std::shared_ptr OpenMLRun::run_flow_on_task( + std::shared_ptr flow, std::shared_ptr task, + bool avoid_duplicate_runs) +{ + if (avoid_duplicate_runs && flow->exists_on_server()) + { + auto flow_from_server = + OpenMLFlow::download_flow(flow->get_flow_id(), ""); + } + + auto data = task->get_dataset(); + + auto model = ShogunOpenML::flow_to_model(flow, true); + flow->set_model(model); + + auto labels = data->get_labels(); + auto features = data->get_features(data->get_default_target_attribute()); + + auto machine = std::dynamic_pointer_cast(model); + if (!machine) + { + SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") + } + + auto xval_storage = std::make_shared(); + + if (task->get_split()->contains_splits()) + { + auto train_idx = task->get_train_indices(); + auto test_idx = task->get_test_indices(); + + xval_storage->set_num_runs(task->get_num_repeats()); + xval_storage->set_num_folds(task->get_num_fold()); + + machine->set_store_model_features(true); + + for (auto repeat_idx : 
range(task->get_num_repeats())) + { + for (auto fold_idx : range(task->get_num_fold())) + { + SGVector train_i_idx( + train_idx[repeat_idx][fold_idx].begin(), + train_idx[repeat_idx][fold_idx].end()); + SGVector test_i_idx( + test_idx[repeat_idx][fold_idx].begin(), + test_idx[repeat_idx][fold_idx].end()); + + xval_storage->append_fold_result( + ShogunOpenML::run_model_on_fold( + machine, task, features, labels, train_i_idx, + test_i_idx, repeat_idx, fold_idx) + .release()); + } + } + } + else + { + xval_storage->set_num_runs(0); + xval_storage->set_num_folds(0); + xval_storage->append_fold_result( + ShogunOpenML::run_model_on_fold(machine, task, features, labels) + .release()); + } + SG_SDEBUG("End of openml run: %s\n", xval_storage->to_string().c_str()); + + return std::make_shared( + std::string{}, // uploader + std::string{}, // uploader_name + std::string{}, // setup_id + std::string{}, // setup_string + std::string{}, // parameter_settings + xval_storage, // xval_storage + std::string{}, // data_content + std::vector{}, // output_files + task, // task + flow, // flow + std::string{}, // run_id + model, // model + std::vector{}, // tags + std::string{} // predictions_url + ); +} + +std::shared_ptr +OpenMLRun::from_filesystem(const std::string& directory) +{ + SG_SNOTIMPLEMENTED + return nullptr; +} + +void OpenMLRun::to_filesystem(const std::string& directory) const +{ + SG_SNOTIMPLEMENTED +} + +void OpenMLRun::publish() const {SG_SNOTIMPLEMENTED} + +std::unique_ptr OpenMLRun::to_xml() const +{ + + return std::unique_ptr(); +} diff --git a/src/shogun/io/openml/OpenMLRun.h b/src/shogun/io/openml/OpenMLRun.h new file mode 100644 index 00000000000..86e3379be31 --- /dev/null +++ b/src/shogun/io/openml/OpenMLRun.h @@ -0,0 +1,77 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). 
+ * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLRUN_H +#define SHOGUN_OPENMLRUN_H + +#include +#include + +#include +#include + +namespace shogun { + class OpenMLRun + { + public: + OpenMLRun( + const std::string& uploader, const std::string& uploader_name, + const std::string& setup_id, const std::string& setup_string, + const std::string& parameter_settings, + std::shared_ptr xval_storage, + const std::string& data_content, + std::vector output_files, + std::shared_ptr task, std::shared_ptr flow, + const std::string& run_id, std::shared_ptr model, + std::vector tags, std::string predictions_url) + : m_uploader(uploader), m_uploader_name(uploader_name), + m_setup_id(setup_id), m_setup_string(setup_string), + m_parameter_settings(parameter_settings), + m_xval_storage(xval_storage), + m_data_content(data_content), + m_output_files(std::move(output_files)), m_task(std::move(task)), + m_flow(std::move(flow)), m_run_id(run_id), + m_model(std::move(model)), m_tags(std::move(tags)), + m_predictions_url(std::move(predictions_url)) + { + } + + static std::shared_ptr + from_filesystem(const std::string& directory); + + static std::shared_ptr run_flow_on_task( + std::shared_ptr flow, + std::shared_ptr task, + bool avoid_duplicate_runs=true); + + static std::shared_ptr run_model_on_task( + std::shared_ptr model, std::shared_ptr task); + + void to_filesystem(const std::string& directory) const; + + std::unique_ptr to_xml() const; + + void publish() const; + + private: + std::string m_uploader; + std::string m_uploader_name; + std::string m_setup_id; + std::string m_setup_string; + std::string m_parameter_settings; + std::shared_ptr m_xval_storage; + std::string m_data_content; + std::vector m_output_files; + std::shared_ptr m_task; + std::shared_ptr m_flow; + std::string m_run_id; + std::shared_ptr m_model; + std::vector m_tags; + std::string m_predictions_url; + }; +} + +#endif //SHOGUN_OPENMLRUN_H diff --git a/src/shogun/io/openml/OpenMLSplit.cpp 
b/src/shogun/io/openml/OpenMLSplit.cpp new file mode 100644 index 00000000000..04d6206ed5f --- /dev/null +++ b/src/shogun/io/openml/OpenMLSplit.cpp @@ -0,0 +1,95 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include +#include +#include + +#include +#include +#include + +using namespace shogun; + +std::shared_ptr +OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) +{ + auto reader = OpenMLFile(api_key); + auto return_string = reader.get(split_url); + + if (return_string == "Task not providing datasplits.") + return std::make_shared(); + + std::shared_ptr return_stream = + std::make_shared(return_string); + auto arff_parser = ARFFDeserializer(return_stream); + arff_parser.read(); + auto arff_features = arff_parser.get_features(); + REQUIRE( + arff_features.size() == 4, + "Expected a ARFF file with 4 attributes: type, rowid, repeat and " + "fold.\n") + + auto type_vector = nominal_feature_to_vector(arff_features[0]); + auto rowid_vector = dense_feature_to_vector(arff_features[1]); + auto repeat_vector = dense_feature_to_vector(arff_features[2]); + auto fold_vector = dense_feature_to_vector(arff_features[3]); + + std::array, 3> train_idx, test_idx; + + for (int i = 0; i < arff_features[0]->get_num_vectors(); ++i) + { + if (type_vector[i] == LabelType::TRAIN) + { + train_idx[0].push_back(rowid_vector[i]); + train_idx[1].push_back(repeat_vector[i]); + train_idx[2].push_back(fold_vector[i]); + } + else + { + test_idx[0].push_back(rowid_vector[i]); + test_idx[1].push_back(repeat_vector[i]); + test_idx[2].push_back(fold_vector[i]); + } + } + + return std::make_shared(train_idx, test_idx); +} + +SGMatrix +OpenMLSplit::dense_feature_to_vector(const std::shared_ptr& feat) +{ + auto casted_feat = + std::dynamic_pointer_cast>(feat); + // this should never happen + if (!casted_feat) + SG_SERROR("Error casting a column in the split file from CFeatures to " + 
"CDenseFeatures!\n>"); + return casted_feat->get_feature_matrix(); +} + +std::vector +OpenMLSplit::nominal_feature_to_vector(const std::shared_ptr& feat) +{ + auto casted_feat = + std::dynamic_pointer_cast>(feat); + // this should never happen + if (!casted_feat) + SG_SERROR("Error casting a column in the split file from CFeatures to " + "CDenseFeatures!\n"); + + std::vector result; + + for (const auto& el : casted_feat->get_feature_matrix()) + { + if (el == 0) + result.push_back(LabelType::TRAIN); + else + result.push_back(LabelType::TEST); + } + + return result; +} \ No newline at end of file diff --git a/src/shogun/io/openml/OpenMLSplit.h b/src/shogun/io/openml/OpenMLSplit.h new file mode 100644 index 00000000000..f032ee91647 --- /dev/null +++ b/src/shogun/io/openml/OpenMLSplit.h @@ -0,0 +1,121 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLSPLIT_H +#define SHOGUN_OPENMLSPLIT_H + +#include +#include + +#include + +namespace shogun +{ + /** + * Handles an OpenML split. + */ + class OpenMLSplit + { + public: + enum class LabelType + { + TRAIN = 1, + TEST = 2 + }; + + /** + * Default constructor. This is used when there are no + * train or test indices. 
+ */ + OpenMLSplit() : m_repeat_count(0), m_fold_count(0) + { + } + + OpenMLSplit( + const std::array, 3>& train_idx, + const std::array, 3>& test_idx) + : m_train_idx(train_idx), m_test_idx(test_idx) + { + // repeats and folds are zero indexed so add 1 + // we also assume that the repeats and folds indices go from + // 0,1,...,N in increments of 1 + m_data_count = + std::make_pair(train_idx[0].size(), test_idx[0].size()); + m_repeat_count = + *std::max_element(train_idx[1].begin(), train_idx[1].end()) + 1; + m_fold_count = + *std::max_element(train_idx[2].begin(), train_idx[2].end()) + 1; + auto test_repeat_count = + *std::max_element(test_idx[1].begin(), test_idx[1].end()) + 1; + auto test_fold_count = + *std::max_element(test_idx[2].begin(), test_idx[2].end()) + 1; + + REQUIRE( + train_idx[0].size() == train_idx[1].size() && + train_idx[0].size() == train_idx[2].size(), + "All dimensions in train_idx must match!\n") + REQUIRE( + test_idx[0].size() == test_idx[1].size() && + test_idx[0].size() == test_idx[2].size(), + "All dimensions in test_idx must match!\n") + + if (m_repeat_count != test_repeat_count) + SG_SERROR( + "Expected the train and test set to have the same number " + "of repeats, but got %d and %d respectively.\n", + m_repeat_count, test_repeat_count) + if (m_repeat_count != test_repeat_count) + SG_SERROR( + "Expected the train and test set to have the same number " + "of folds, but got %d and %d respectively.\n", + m_fold_count, test_fold_count) + } + + static std::shared_ptr + get_split(const std::string& split_url, const std::string& api_key); + + SG_FORCED_INLINE std::array, 3> + get_train_idx() const noexcept + { + return m_train_idx; + } + + SG_FORCED_INLINE std::array, 3> + get_test_idx() const noexcept + { + return m_test_idx; + } + + SG_FORCED_INLINE bool contains_splits() const noexcept + { + return !m_train_idx[0].empty() && !m_test_idx[0].empty(); + } + + SG_FORCED_INLINE int32_t get_num_repeats() const noexcept + { + return 
m_repeat_count; + } + + SG_FORCED_INLINE int32_t get_num_folds() const noexcept + { + return m_fold_count; + } + + private: + static SGMatrix + dense_feature_to_vector(const std::shared_ptr& feat); + + static std::vector + nominal_feature_to_vector(const std::shared_ptr& feat); + + std::array, 3> m_train_idx; + std::array, 3> m_test_idx; + std::pair m_data_count; + int32_t m_repeat_count; + int32_t m_fold_count; + }; +} // namespace shogun +#endif // SHOGUN_OPENMLSPLIT_H diff --git a/src/shogun/io/openml/OpenMLTask.cpp b/src/shogun/io/openml/OpenMLTask.cpp new file mode 100644 index 00000000000..f41bd3d0cc7 --- /dev/null +++ b/src/shogun/io/openml/OpenMLTask.cpp @@ -0,0 +1,171 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include +#include +#include + +using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; + +struct DatasetStruct +{ + std::string dataset_id; + std::string target_feature; +}; + +struct SplitStruct +{ + std::string split_id; + std::string type; + std::string data_splits_url; + std::unordered_map split_parameters; +}; + +std::shared_ptr +OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) +{ + std::string task_name; + std::string task_type_id; + std::unordered_map evaluation_measures; + + DatasetStruct dataset_struct{}; + SplitStruct split_struct{}; + + auto reader = OpenMLFile(api_key); + auto return_string = reader.get("task_file", "json", task_id); + + auto& root = check_response(return_string, "task"); + + REQUIRE( + task_id == root["task_id"].GetString(), + "Expected downloaded task to have the same id as the requested task " + "id, but got \"%s\", instead of \"%s\".\n", + root["task_id"].GetString(), task_id.c_str()) + + task_name = root["task_name"].GetString(); + OpenMLTask::TaskType task_type = + get_task_from_string(root["task_type"].GetString()); + task_type_id = root["task_type_id"].GetString(); + + // 
expect two elements in input array: dataset and split + auto& json_input = root["input"]; + + auto input_array = json_input.GetArray(); + + for (const auto& task_settings : input_array) + { + if (strcmp(task_settings["name"].GetString(), "source_data") == 0) + { + auto dataset_info = task_settings["data_set"].GetObject(); + add_string_to_struct( + dataset_info, "data_set_id", dataset_struct.dataset_id); + add_string_to_struct( + dataset_info, "target_feature", dataset_struct.target_feature); + } + else if ( + strcmp(task_settings["name"].GetString(), "estimation_procedure") == + 0) + { + auto split_info = task_settings["estimation_procedure"].GetObject(); + add_string_to_struct(split_info, "id", split_struct.split_id); + add_string_to_struct(split_info, "type", split_struct.type); + add_string_to_struct( + split_info, "data_splits_url", split_struct.data_splits_url); + + std::unordered_map split_parameters; + for (const auto& param : split_info["parameter"].GetArray()) + { + if (param.HasMember("name") && param.HasMember("value")) + split_parameters.emplace( + param["name"].GetString(), param["value"].GetString()); + else if (param.HasMember("name")) + split_parameters.emplace(param["name"].GetString(), ""); + else + SG_SERROR( + "Unexpected number of parameters in parameter array " + "of estimation_procedure.\n") + } + split_struct.split_parameters = split_parameters; + + REQUIRE( + split_struct.type == "crossvalidation", + "Currently only tasks with cross validation are enabled in " + "shogun!\n") + } + else if ( + strcmp(task_settings["name"].GetString(), "evaluation_measures") == + 0) + { + auto evaluation_info = + task_settings["evaluation_measures"].GetObject(); + for (const auto& param : evaluation_info) + { + if (param.value.IsString()) + evaluation_measures.emplace( + param.name.GetString(), param.value.GetString()); + else + evaluation_measures.emplace(param.name.GetString(), ""); + } + } + } + + REQUIRE( + !dataset_struct.dataset_id.empty(), + "The 
dataset ID is required to retrieve the dataset!\n") + auto openml_dataset = + OpenMLData::get_dataset(dataset_struct.dataset_id, api_key); + + REQUIRE( + !split_struct.data_splits_url.empty(), + "The split URL is required to retrieve the split information!\n") + auto openml_split = + OpenMLSplit::get_split(split_struct.data_splits_url, api_key); + + auto result = std::make_shared( + task_id, task_name, task_type, task_type_id, evaluation_measures, + openml_split, openml_dataset); + + return result; +} + +OpenMLTask::TaskType +OpenMLTask::get_task_from_string(const std::string& task_type) +{ + if (task_type == "Supervised Classification") + return OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION; + SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) +} + +std::vector>> +OpenMLTask::get_train_indices() const +{ + return get_indices(m_split->get_train_idx()); +} + +std::vector>> +OpenMLTask::get_test_indices() const +{ + return get_indices(m_split->get_test_idx()); +} + +std::vector>> +OpenMLTask::get_indices(const std::array, 3>& idx) const +{ + // result = (n_repeats, n_folds, ?) where ? is the number of indices in a + // given fold + std::vector>> result( + m_split->get_num_repeats(), + std::vector>( + m_split->get_num_folds(), std::vector{})); + for (int i = 0; i < idx[0].size(); ++i) + { + // result[repeat][fold].push_back(data_index) + result[idx[1][i]][idx[2][i]].push_back(idx[0][i]); + } + return result; +} \ No newline at end of file diff --git a/src/shogun/io/openml/OpenMLTask.h b/src/shogun/io/openml/OpenMLTask.h new file mode 100644 index 00000000000..c248ee7d109 --- /dev/null +++ b/src/shogun/io/openml/OpenMLTask.h @@ -0,0 +1,108 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLTASK_H +#define SHOGUN_OPENMLTASK_H + +#include +#include + +namespace shogun +{ + /** + * Handles OpenML tasks. 
A task contains all the information + * required to train and test a model. + */ + class OpenMLTask + { + public: + enum class TaskType + { + SUPERVISED_CLASSIFICATION = 0, + SUPERVISED_REGRESSION = 1, + LEARNING_CURVE = 2, + SUPERVISED_DATASTREAM_CLASSIFICATION = 3, + CLUSTERING = 4, + MACHINE_LEARNING_CHALLENGE = 5, + SURVIVAL_ANALYSIS = 6, + SUBGROUP_DISCOVERY = 7 + }; + + enum class TaskEvaluation + { + + }; + + OpenMLTask( + const std::string& task_id, const std::string task_name, + TaskType task_type, const std::string& task_type_id, + std::unordered_map evaluation_measures, + std::shared_ptr split, + std::shared_ptr data) + : m_task_id(task_id), m_task_name(task_name), + m_task_type(task_type), m_task_type_id(task_type_id), + m_evaluation_measures(std::move(evaluation_measures)), + m_split(std::move(split)), m_data(std::move(data)) + { + } + + static std::shared_ptr + get_task(const std::string& task_id, const std::string& api_key); + + std::shared_ptr get_dataset() const noexcept + { + return m_data; + } + + std::shared_ptr get_split() const noexcept + { + return m_split; + } + + std::vector>> + get_train_indices() const; + + std::vector>> get_test_indices() const; + + int32_t get_num_fold() const noexcept + { + return m_split->get_num_folds(); + } + + int32_t get_num_repeats() const noexcept + { + return m_split->get_num_repeats(); + } + + std::string get_task_name() const noexcept + { + return m_task_name; + } + +#ifndef SWIG + SG_FORCED_INLINE TaskType get_task_type() const noexcept + { + return m_task_type; + } +#endif // SWIG + + private: + static TaskType get_task_from_string(const std::string& task_type); + + std::vector>> + get_indices(const std::array, 3>& idx) const; + + std::string m_task_id; + std::string m_task_name; + TaskType m_task_type; + std::string m_task_type_id; + std::unordered_map m_evaluation_measures; + std::shared_ptr m_split; + std::shared_ptr m_data; + }; +} // namespace shogun + +#endif // SHOGUN_OPENMLTASK_H diff --git 
a/src/shogun/io/openml/ShogunOpenML.cpp b/src/shogun/io/openml/ShogunOpenML.cpp new file mode 100644 index 00000000000..87987e3036f --- /dev/null +++ b/src/shogun/io/openml/ShogunOpenML.cpp @@ -0,0 +1,532 @@ +#include + +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include +#include +#include + +#include + +using namespace shogun; + +/** + * Class using the Any visitor pattern to convert + * a string to a C++ type that can be used as a parameter + * in a Shogun model. If the string value is not "null" it will + * be put in its casted type in the given model with the provided parameter + * name. If the value is null nothing happens, i.e. no error is thrown + * and no value is put in model. + */ +class StringToShogun : public AnyVisitor +{ +public: + explicit StringToShogun(std::shared_ptr model) + : m_model(std::move(model)), m_parameter(""), + m_string_val(""){SG_SDEBUG("Debugging StringToShogun\n")}; + + StringToShogun( + std::shared_ptr model, const std::string& parameter, + const std::string& string_val) + : m_model(std::move(model)), m_parameter(parameter), + m_string_val(string_val){SG_SDEBUG("Debugging StringToShogun\n")}; + + void on(bool* v) final + { + SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + bool result = strcmp(m_string_val.c_str(), "true") == 0; + m_model->put(m_parameter, result); + } + } + void on(int32_t* v) final + { + SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + try + { + int32_t result = std::stoi(m_string_val); + m_model->put(m_parameter, result); + } + catch (const std::invalid_argument&) + { + // it's an option, i.e. 
internally represented + // as an enum but in swig exposed as a string + m_string_val.erase( + std::remove_if( + m_string_val.begin(), m_string_val.end(), + // remove quotes + [](const auto& val) { return val == '\"'; }), + m_string_val.end()); + m_model->put(m_parameter, m_string_val); + } + } + } + void on(int64_t* v) final + { + SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + + int64_t result = std::stol(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(float* v) final + { + SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + float32_t result = std::stof(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(double* v) final + { + SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + float64_t result = std::stod(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(long double* v) + { + SG_SDEBUG( + "long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + floatmax_t result = std::stold(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(CSGObject** v) final + { + SG_SDEBUG( + "CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) + } + void on(SGMatrix* mat) final{SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())} + 
+ /** + * In OpenML "null" is an empty parameter value field. + * @return whether the field is "null" + */ + SG_FORCED_INLINE bool is_null() const noexcept + { + bool result = strcmp(m_string_val.c_str(), "null") == 0; + return result; + } + + SG_FORCED_INLINE void set_parameter_name(const std::string& name) noexcept + { + m_parameter = name; + } + + SG_FORCED_INLINE void set_string_value(const std::string& value) noexcept + { + m_string_val = value; + } + +private: + std::shared_ptr m_model; + std::string m_parameter; + std::string m_string_val; +}; + +/** + * Instantiates a CSGObject using a factory + * @param factory_name the name of the factory + * @param algo_name the name of algorithm passed to factory + * @return the instantiated object using a factory + */ +std::shared_ptr instantiate_model_from_factory( + const std::string& factory_name, const std::string& algo_name) +{ + if (factory_name == "machine") + return std::shared_ptr(machine(algo_name)); + if (factory_name == "kernel") + return std::shared_ptr(kernel(algo_name)); + if (factory_name == "distance") + return std::shared_ptr(distance(algo_name)); + + SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) + + return nullptr; +} + +/** + * Downcasts a CSGObject and puts it in the map of obj. + * @param obj the main object + * @param nested_obj the object to be casted and put in the obj map. 
+ * @param parameter_name the name of nested_obj + */ +void cast_and_put( + const std::shared_ptr& obj, + const std::shared_ptr& nested_obj, + const std::string& parameter_name) +{ + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + // TODO: remove clone + // temporary fix until shared_ptr PR merged + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + return; + } + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + return; + } + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + return; + } + SG_SERROR("Could not cast SGObject.\n") +} + +std::shared_ptr ShogunOpenML::flow_to_model( + std::shared_ptr flow, bool initialize_with_defaults) +{ + auto params = flow->get_parameters(); + auto components = flow->get_components(); + auto class_name = get_class_info(flow->get_class_name()); + auto module_name = class_name.first; + auto algo_name = class_name.second; + + auto obj = instantiate_model_from_factory(module_name, algo_name); + auto obj_param = obj->get_params(); + + auto visitor = std::make_unique(obj); + + if (initialize_with_defaults) + { + for (const auto& param : params) + { + Any any_val = obj_param.at(param.first)->get_value(); + std::string name = param.first; + std::string val_as_string = param.second.default_value; + visitor->set_parameter_name(name); + visitor->set_string_value(val_as_string); + any_val.visit(visitor.get()); + } + } + + for (const auto& component : components) + { + std::shared_ptr nested_obj = + flow_to_model(component.second, initialize_with_defaults); + cast_and_put(obj, nested_obj, component.first); + } + + SG_SDEBUG("Final object: %s.\n", obj->to_string().c_str()); + + return obj; +} + +std::shared_ptr +ShogunOpenML::model_to_flow(const std::shared_ptr& 
model) +{ + return std::shared_ptr(); +} + +std::pair +ShogunOpenML::get_class_info(const std::string& class_name) +{ + std::vector class_components; + auto begin = class_name.begin(); + std::pair result; + + for (auto it = class_name.begin(); it != class_name.end(); ++it) + { + if (*it == '.') + { + class_components.emplace_back(std::string(begin, it)); + begin = std::next(it); + } + if (std::next(it) == class_name.end()) + class_components.emplace_back(std::string(begin, std::next(it))); + } + + if (class_components.empty()) + SG_SERROR( + "Error parsing flow class name \"%s\"!\n", class_name.c_str()); + + if (class_components.size() == 3 && class_components[0] == "shogun") + result = std::make_pair(class_components[1], class_components[2]); + else if (class_components.size() != 3 && class_components[0] == "shogun") + SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) + else + SG_SERROR( + "The provided flow is not meant for shogun deserialisation! The " + "required library is \"%s\".\n", + class_components[0].c_str()) + + return result; +} + +std::unique_ptr ShogunOpenML::run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + const std::shared_ptr& labels, const SGVector& train_idx, + const SGVector& test_idx, index_t repeat_idx, index_t fold_idx) +{ + auto task_type = task->get_task_type(); + + CEvaluation* evaluation_criterion = nullptr; + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + evaluation_criterion = new CAccuracyMeasure(); + break; + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + evaluation_criterion = new CMeanAbsoluteError(); + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case 
OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + { + // copied/adapted from crossvalidation + auto fold = std::make_unique(); + auto* cloned_machine = machine->clone()->as(); + + // TODO while these are not used through const interfaces, + // we unfortunately have to clone, even though these could be + // shared + auto* features_clone = features->clone()->as(); + auto* labels_clone = labels->clone()->as(); + + /* evtl. update xvalidation output class */ + fold->set_run_index(repeat_idx); + fold->set_fold_index(fold_idx); + + /* set feature and label subset for training */ + features_clone->add_subset(train_idx); + labels_clone->add_subset(train_idx); + + SG_SDEBUG( + "train set repeat %d fold %d: %s\n", repeat_idx, fold_idx, + train_idx.to_string().c_str()) + + /* train machine on training features and remove subset */ + SG_SDEBUG("starting training\n") + cloned_machine->set_labels(labels_clone); + cloned_machine->train(features_clone); + SG_SDEBUG("finished training\n") + + /* evtl. 
update xvalidation output class */ + fold->set_train_indices(train_idx); + auto* fold_machine = cloned_machine->clone()->as(); + fold->set_trained_machine(fold_machine); + SG_UNREF(fold_machine) + + features_clone->remove_subset(); + labels_clone->remove_subset(); + + /* set features and label subset for testing */ + features_clone->add_subset(test_idx); + labels_clone->add_subset(test_idx); + + SG_SDEBUG( + "test set repeat %d fold %d: %s\n", repeat_idx, fold_idx, + test_idx.to_string().c_str()) + + /* apply machine to test features and remove subset */ + SG_SDEBUG("starting evaluation\n") + SG_SDEBUG("%p\n", features_clone) + CLabels* result_labels = cloned_machine->apply(features_clone); + SG_SDEBUG("finished evaluation\n") + features_clone->remove_subset(); + SG_REF(result_labels); + + /* evaluate */ + auto result = + evaluation_criterion->evaluate(result_labels, labels_clone); + SG_SINFO( + "result on repeat %d fold %d is %f\n", repeat_idx, fold_idx, result) + + /* evtl. update xvalidation output class */ + fold->set_test_indices(test_idx); + fold->set_test_result(result_labels); + auto* true_labels = (CLabels*)labels->clone(); + fold->set_test_true_result(true_labels); + SG_UNREF(true_labels) + fold->post_update_results(); + fold->set_evaluation_result(result); + + /* clean up, remove subsets */ + labels->remove_subset(); + SG_UNREF(cloned_machine); + SG_UNREF(features_clone); + SG_UNREF(labels_clone); + SG_UNREF(result_labels); + delete evaluation_criterion; + return fold; + } + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + return nullptr; +} + +std::unique_ptr 
ShogunOpenML::run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + const std::shared_ptr& labels) +{ + auto task_type = task->get_task_type(); + + CEvaluation* evaluation_criterion = nullptr; + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + evaluation_criterion = new CAccuracyMeasure(); + break; + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + evaluation_criterion = new CMeanAbsoluteError(); + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + { + auto fold = std::make_unique(); + auto* cloned_machine = machine->clone()->as(); + auto* features_clone = features->clone()->as(); + auto* labels_clone = labels->clone()->as(); + fold->set_run_index(0); + fold->set_fold_index(0); + + /* train machine on training features */ + SG_SDEBUG("starting training\n") + cloned_machine->set_labels(labels_clone); + cloned_machine->train(features_clone); + SG_SDEBUG("finished training\n") + + auto* fold_machine = cloned_machine->clone()->as(); + fold->set_trained_machine(fold_machine); + SG_UNREF(fold_machine) + + /* apply machine to test features */ + SG_SDEBUG("starting evaluation\n") + SG_SDEBUG("%p\n", features_clone) + CLabels* result_labels = cloned_machine->apply(features_clone); + SG_SDEBUG("finished evaluation\n") + + /* evaluate */ + auto result = + evaluation_criterion->evaluate(result_labels, labels_clone); + SG_SINFO("result is %f\n", result) + 
+ /* evtl. update xvalidation output class */ + fold->set_test_result(result_labels); + auto* true_labels = (CLabels*)labels->clone(); + fold->set_test_true_result(true_labels); + SG_UNREF(true_labels) + fold->post_update_results(); + fold->set_evaluation_result(result); + + // cleanup + SG_UNREF(cloned_machine); + SG_UNREF(features_clone); + SG_UNREF(labels_clone); + SG_UNREF(result_labels); + delete evaluation_criterion; + return fold; + } + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + return nullptr; +} \ No newline at end of file diff --git a/src/shogun/io/openml/ShogunOpenML.h b/src/shogun/io/openml/ShogunOpenML.h new file mode 100644 index 00000000000..88f0e19a842 --- /dev/null +++ b/src/shogun/io/openml/ShogunOpenML.h @@ -0,0 +1,80 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_SHOGUNOPENML_H +#define SHOGUN_SHOGUNOPENML_H + +#include +#include + +#include +#include + +namespace shogun +{ + class OpenMLRun; + /** + * The Shogun OpenML extension to run models from an OpenMLFlow + * and convert models to OpenMLFlow. + */ + class ShogunOpenML + { + public: + friend class OpenMLRun; + /** + * Instantiates a SGObject from an OpenMLFlow. + * + * @param flow the flow to instantiate + * @param initialize_with_defaults whether to use the default values + * specified in the flow + * @return the flow as a trainable model + */ + static std::shared_ptr flow_to_model( + std::shared_ptr flow, bool initialize_with_defaults); + + /** + * Converts a SGObject to an OpenMLFlow. 
+ * + * @param model the model to convert + * @return the flow from the model conversion + */ + static std::shared_ptr + model_to_flow(const std::shared_ptr& model); + + protected: + static std::unique_ptr run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + const std::shared_ptr& labels, + const SGVector& train_idx, + const SGVector& test_id, + index_t repeat_number, + index_t fold_number); + + static std::unique_ptr run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + const std::shared_ptr& labels); + + private: + /** + * Helper function to extract module/factory information from the + * class name field of OpenMLFlow. Throws an error either if the + * class name field is ill formed (i.e. not + * library.module.algorithm) or if the library name is not "shogun". + * + * @param class_name the flow class_name field + * @return a tuple with the module name (factory string) and the + * algorithm name + */ + static std::pair + get_class_info(const std::string& class_name); + }; +} // namespace shogun + +#endif // SHOGUN_SHOGUNOPENML_H diff --git a/src/shogun/io/openml/utils.h b/src/shogun/io/openml/utils.h new file mode 100644 index 00000000000..b9f0d0ec1bf --- /dev/null +++ b/src/shogun/io/openml/utils.h @@ -0,0 +1,165 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). 
+ * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENML_UTILS_H +#define SHOGUN_OPENML_UTILS_H + +#include + +#include + +namespace shogun +{ + namespace openml_detail + { + enum class BACKEND_FORMAT + { + JSON = 0, + XML = 1, + }; + + /** + * Checks the returned response from OpenML in JSON format + * @param doc the parsed OpenML JSON format response + */ + template < + BACKEND_FORMAT FormatT, + typename std::enable_if_t* = + nullptr> + const rapidjson::Value& + check_response(const std::string& val, const std::string& root_name) + { + rapidjson::Document doc; + doc.Parse(val.c_str()); + + if (SG_UNLIKELY(doc.HasMember("error"))) + { + const rapidjson::Value& root = doc["error"]; + SG_SERROR( + "Server error %s: %s\n", root["code"].GetString(), + root["message"].GetString()) + } + REQUIRE( + doc.HasMember(root_name.c_str()), + "Unexpected format of OpenML %s.\n", root_name.c_str()); + + return doc[root_name.c_str()]; + } + + /** + * Checks the returned response from OpenML in XML format + * @param doc the parsed OpenML XML format response + */ + template < + BACKEND_FORMAT FormatT, + typename std::enable_if_t* = + nullptr> + void check_response(const std::string& val, const std::string& type) + { + SG_SNOTIMPLEMENTED + } + + template + static SG_FORCED_INLINE void add_string_to_struct( + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v, + const std::string& name, T& custom_struct) + { + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + custom_struct = v[name.c_str()].GetString(); + } + + template + static SG_FORCED_INLINE void add_string_to_struct( + const rapidjson::GenericValue>& v, + const std::string& name, T& custom_struct) + { + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + custom_struct = v[name.c_str()].GetString(); + } + + template + SG_FORCED_INLINE T must_return( + const std::string& name, + const rapidjson::GenericValue>& v) + { + SG_SNOTIMPLEMENTED + } + + template <> + SG_FORCED_INLINE 
std::string must_return( + const std::string& name, + const rapidjson::GenericValue>& v) + { + if (v.HasMember(name.c_str()) && v[name.c_str()].IsString()) + return v[name.c_str()].GetString(); + if (v.HasMember(name.c_str()) && !v[name.c_str()].IsString()) + SG_SERROR( + "Found member \"%s\" but it is not a string", name.c_str()) + if (!v.HasMember(name.c_str())) + SG_SERROR( + "\"%s\" is not a member of the given object", name.c_str()) + return nullptr; + } + + + template + SG_FORCED_INLINE T return_if_possible( + const std::string& name, + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v) + { + SG_SNOTIMPLEMENTED + } + + template <> + SG_FORCED_INLINE std::string return_if_possible( + const std::string& name, + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v) + { + if (v.HasMember(name.c_str()) && v[name.c_str()].IsString()) + return v[name.c_str()].GetString(); + if (v.HasMember(name.c_str()) && !v[name.c_str()].IsString()) + SG_SERROR( + "Found member \"%s\" but it is not a string", name.c_str()) + if (!v.HasMember(name.c_str())) + return ""; + return nullptr; + } + + template <> + SG_FORCED_INLINE std::vector + return_if_possible>( + const std::string& name, + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v) + { + std::vector result; + if (!v.HasMember(name.c_str())) + SG_SERROR( + "\"%s\" is not a member of the given object", name.c_str()) + if (v[name.c_str()].IsString()) + { + result.emplace_back(v[name.c_str()].GetString()); + } + if (v[name.c_str()].IsArray()) + { + for (const auto& val : v[name.c_str()].GetArray()) + { + if (val.IsString()) + result.emplace_back(val.GetString()); + else + SG_SERROR( + "Found non string member in \"%s\".\n", + name.c_str()) + } + } + return result; + } + } // namespace openml_detail +} // namespace shogun +#endif // SHOGUN_OPENML_UTILS_H