From e6d5a2057fdf500581e9c7411546d1682a98d74b Mon Sep 17 00:00:00 2001 From: gf712 Date: Fri, 3 May 2019 14:48:18 +0100 Subject: [PATCH 01/32] initial openml code --- src/shogun/io/OpenmlFlow.cpp | 100 +++++++++++++++++++++++ src/shogun/io/OpenmlFlow.h | 154 +++++++++++++++++++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 src/shogun/io/OpenmlFlow.cpp create mode 100644 src/shogun/io/OpenmlFlow.h diff --git a/src/shogun/io/OpenmlFlow.cpp b/src/shogun/io/OpenmlFlow.cpp new file mode 100644 index 00000000000..f9cdb19af00 --- /dev/null +++ b/src/shogun/io/OpenmlFlow.cpp @@ -0,0 +1,100 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifdef HAVE_CURL + +#include +#include "OpenmlFlow.h" + + +using namespace shogun; + +size_t writer(char *data, size_t size, size_t nmemb, std::string* buffer_in) +{ + // adapted from https://stackoverflow.com/a/5780603 + // Is there anything in the buffer? + if (buffer_in->empty()) + { + // Append the data to the buffer + buffer_in->append(data, size * nmemb); + + return size * nmemb; + } + + return 0; +} + +const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; +const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLReader::dataset_description = "/data/{}"; +const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; +const char* OpenMLReader::data_features = "/data/features/{}"; +const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; +const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; +const char* OpenMLReader::flow_file = "/flow/{}"; + +const std::unordered_map + OpenMLReader::m_format_options = {{"xml", xml_server}, + {"json", json_server}}; +const std::unordered_map + OpenMLReader::m_request_options = { + + {"dataset_description", dataset_description}, + {"list_data_qualities", list_data_qualities}, + {"data_features", 
data_features}, + {"list_dataset_qualities", list_dataset_qualities}, + {"list_dataset_filter", list_dataset_filter}, + {"flow_file", flow_file}}; + +OpenMLReader::OpenMLReader(const std::string& api_key) : m_api_key(api_key) +{ +} + +void OpenMLReader::post(const std::string& request, const std::string& data) +{ +} + +void OpenMLReader::openml_curl_request_helper(const std::string& url) +{ + CURL* curl_handle = nullptr; + + curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.") + return; + } + + curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl_handle, CURLOPT_HTTPGET,1); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); + + CURLcode res = curl_easy_perform(curl_handle); + + openml_curl_error_helper(res); + + curl_easy_cleanup(curl_handle); +} + +void OpenMLReader::openml_curl_error_helper(CURLcode code) { + +} + + +void OpenMLFlow::download_flow() +{ + + auto reader = OpenMLReader(m_api_key); + auto return_string = reader.get("flow_file", "json", m_flow_id); +} + +void OpenMLFlow::upload_flow(const OpenMLFlow& flow) +{ +} + +#endif // HAVE_CURL diff --git a/src/shogun/io/OpenmlFlow.h b/src/shogun/io/OpenmlFlow.h new file mode 100644 index 00000000000..6787df3e252 --- /dev/null +++ b/src/shogun/io/OpenmlFlow.h @@ -0,0 +1,154 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLFLOW_H +#define SHOGUN_OPENMLFLOW_H + +#ifdef HAVE_CURL + +#include + +#include +#include +#include +#include +#include +#include + +namespace shogun +{ + class OpenMLReader + { + + public: + explicit OpenMLReader(const std::string& api_key); + + /** + * Returns a string returned by the server given a request. + * Raises an error if the returned code is not 200. 
+ * Additional arguments can be passed to the request, + * which are then concatenated with a "/" character. + * + * @tparam Args argument type pack, should all be std::string + * @param request the request name, see m_request_options + * @param format the format to return the data in, see m_format_options + * @param args the additional arguments to be passed to request + * @return the returned stream from the server if the return code is 200 + */ + template + std::string + get(const std::string& request, const std::string& format, Args... args) + { + auto find_format = m_format_options.find(format); + if (find_format == m_format_options.end()) + { + SG_SERROR( + "The provided format \"%s\" is not available\n", + format.c_str()) + } + auto find_request = m_request_options.find(request); + if (find_request == m_request_options.end()) + { + SG_SERROR( + "Could not find a way to solve the request \"%s\"\n", + request.c_str()) + } + std::string request_format = find_format->second; + std::string request_path = find_request->second; + + // get additional args and concatenate them with "/" + if (sizeof...(Args) > 0) + { + if (request_path.substr(request_path.size() - 2) == "{}") + { + request_path = + request_path.substr(0, request_path.size() - 2); + } + else + { + SG_SERROR( + "The provided request \"%s\" cannot handle additional " + "args.\n", + request.c_str()) + } + std::vector args_vec = {args...}; + std::string args_string = std::accumulate( + args_vec.begin() + 1, args_vec.end(), args_vec.front(), + [](std::string s0, std::string& s1) { + return s0 += "/" + s1; + }); + request_path += args_string; + } + + std::string url = request_format + request_path + "?" + m_api_key; + + openml_curl_request_helper(url); + + return m_curl_response_buffer; + } + + void post(const std::string& request, const std::string& data); + + private: + + std::string m_curl_response_buffer; + + /** + * Initialises CURL session and gets the data. 
+ * This function also handles the response code from the server. + * + * @param url the url to query + */ + void openml_curl_request_helper(const std::string& url); + + /** + * Handles all possible codes + * + * @param code the code returned by the query + */ + void openml_curl_error_helper(CURLcode code); + + std::string m_api_key; + + static const char* xml_server; + static const char* json_server; + + static const std::unordered_map + m_format_options; + static const std::unordered_map + m_request_options; + + /* DATA API */ + static const char* dataset_description; + static const char* list_data_qualities; + static const char* data_features; + static const char* list_dataset_qualities; + static const char* list_dataset_filter; + + /* FLOW API */ + static const char* flow_file; + }; + + class OpenMLFlow + { + + public: + explicit OpenMLFlow( + const std::string& api_key, const std::string& flow_id) + : m_api_key(api_key), m_flow_id(flow_id){}; + + void download_flow(); + + static void upload_flow(const OpenMLFlow& flow); + + private: + std::string m_api_key; + std::string m_flow_id; + }; +} // namespace shogun +#endif // HAVE_CURL + +#endif // SHOGUN_OPENMLFLOW_H From 7cf1d102021108022324cfb91031772283b571df Mon Sep 17 00:00:00 2001 From: gf712 Date: Tue, 7 May 2019 14:09:35 +0100 Subject: [PATCH 02/32] basic flow interface working with python --- cmake/ShogunUtils.cmake | 81 ++-- cmake/external/RapidJSON.cmake | 16 + src/interfaces/swig/IO.i | 2 + src/interfaces/swig/IO_includes.i | 1 + src/interfaces/swig/SGBase.i | 1 + src/shogun/CMakeLists.txt | 407 ++++++++++--------- src/shogun/io/OpenMLFlow.cpp | 187 +++++++++ src/shogun/io/{OpenmlFlow.h => OpenMLFlow.h} | 92 ++++- src/shogun/io/OpenmlFlow.cpp | 100 ----- 9 files changed, 538 insertions(+), 349 deletions(-) create mode 100644 cmake/external/RapidJSON.cmake create mode 100644 src/shogun/io/OpenMLFlow.cpp rename src/shogun/io/{OpenmlFlow.h => OpenMLFlow.h} (59%) delete mode 100644 
src/shogun/io/OpenmlFlow.cpp diff --git a/cmake/ShogunUtils.cmake b/cmake/ShogunUtils.cmake index 4fcfb533c53..eca6cdad079 100644 --- a/cmake/ShogunUtils.cmake +++ b/cmake/ShogunUtils.cmake @@ -75,31 +75,31 @@ ENDMACRO() # based on compiz_discover_tests function (shogun_discover_tests EXECUTABLE) - add_dependencies (${EXECUTABLE} discover_gtest_tests) - - add_custom_command (TARGET ${EXECUTABLE} - POST_BUILD - COMMAND ${CMAKE_COMMAND} -D UNIT_TEST_CMD=${CMAKE_BINARY_DIR}/bin/${EXECUTABLE} - -D DISCOVER_CMD=${CMAKE_BINARY_DIR}/bin/discover_gtest_tests - -D WORKING_DIR=${CMAKE_CURRENT_BINARY_DIR} - -P ${CMAKE_MODULE_PATH}/discover_unit_tests.cmake - COMMENT "Discovering Tests in ${EXECUTABLE}" - DEPENDS - VERBATIM) + add_dependencies (${EXECUTABLE} discover_gtest_tests) + + add_custom_command (TARGET ${EXECUTABLE} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -D UNIT_TEST_CMD=${CMAKE_BINARY_DIR}/bin/${EXECUTABLE} + -D DISCOVER_CMD=${CMAKE_BINARY_DIR}/bin/discover_gtest_tests + -D WORKING_DIR=${CMAKE_CURRENT_BINARY_DIR} + -P ${CMAKE_MODULE_PATH}/discover_unit_tests.cmake + COMMENT "Discovering Tests in ${EXECUTABLE}" + DEPENDS + VERBATIM) endfunction () MACRO(AddMetaIntegrationTest META_TARGET CONDITION) - IF (${CONDITION}) - add_test(NAME integration_meta_${META_TARGET}-${NAME_WITH_DIR} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - COMMAND meta_example_integration_tester ${REL_DIR} ${NAME}.dat ${META_TARGET} generated_results reference_results) - set_tests_properties( - integration_meta_${META_TARGET}-${NAME_WITH_DIR} - PROPERTIES - LABELS "integration" - DEPENDS generated_${META_TARGET}-${NAME_WITH_DIR} - ) - ENDIF() + IF (${CONDITION}) + add_test(NAME integration_meta_${META_TARGET}-${NAME_WITH_DIR} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMAND meta_example_integration_tester ${REL_DIR} ${NAME}.dat ${META_TARGET} generated_results reference_results) + set_tests_properties( + integration_meta_${META_TARGET}-${NAME_WITH_DIR} + PROPERTIES + LABELS 
"integration" + DEPENDS generated_${META_TARGET}-${NAME_WITH_DIR} + ) + ENDIF() ENDMACRO() MACRO(AddLibShogunExample EXAMPLE_CPP) @@ -144,18 +144,23 @@ macro(ADD_LIBRARY_DEPENDENCY) set(oneValueArgs LIBRARY CONFIG_FLAG VERSION SCOPE) set(multiValueArgs TARGETS) cmake_parse_arguments(ADD_LIBRARY_DEPENDENCY "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - STRING(TOUPPER ${ADD_LIBRARY_DEPENDENCY_LIBRARY} LIBRARY_PREFIX) - OPTION(ENABLE_${LIBRARY_PREFIX} "Use ${LIBRARY_PREFIX}" ON) + SET(LIBRARY_PREFIX ${ADD_LIBRARY_DEPENDENCY_LIBRARY}) + STRING(TOUPPER ${ADD_LIBRARY_DEPENDENCY_LIBRARY} LIBRARY_PREFIX_UPPER) + OPTION(ENABLE_${LIBRARY_PREFIX_UPPER} "Use ${LIBRARY_PREFIX}" ON) if (${ADD_LIBRARY_DEPENDENCY_REQUIRED}) find_package(${ADD_LIBRARY_DEPENDENCY_LIBRARY} REQUIRED ${ADD_LIBRARY_DEPENDENCY_VERSION}) else() find_package(${ADD_LIBRARY_DEPENDENCY_LIBRARY} ${ADD_LIBRARY_DEPENDENCY_VERSION}) endif() - if (${LIBRARY_PREFIX}_FOUND AND ENABLE_${LIBRARY_PREFIX}) + if ((${LIBRARY_PREFIX}_FOUND OR ${LIBRARY_PREFIX_UPPER}_FOUND) AND ENABLE_${LIBRARY_PREFIX_UPPER}) if (${LIBRARY_PREFIX}_INCLUDE_DIR) set(LIBRARY_HEADER ${${LIBRARY_PREFIX}_INCLUDE_DIR}) + elseif (${LIBRARY_PREFIX_UPPER}_INCLUDE_DIR) + set(LIBRARY_HEADER ${${LIBRARY_PREFIX_UPPER}_INCLUDE_DIR}) elseif (${LIBRARY_PREFIX}_INCLUDE_DIRS) set(LIBRARY_HEADER ${${LIBRARY_PREFIX}_INCLUDE_DIRS}) + elseif (${LIBRARY_PREFIX_UPPER}_INCLUDE_DIRS) + set(LIBRARY_HEADER ${${LIBRARY_PREFIX_UPPER}_INCLUDE_DIRS}) else () message(${${LIBRARY_PREFIX}_INCLUDE_DIR}) message(FATAL_ERROR "Found ${ADD_LIBRARY_DEPENDENCY_LIBRARY}, but not it's headers!") @@ -163,15 +168,23 @@ macro(ADD_LIBRARY_DEPENDENCY) set(${ADD_LIBRARY_DEPENDENCY_CONFIG_FLAG} ON CACHE BOOL "Use ${LIBRARY_PREFIX}" FORCE) + if (${LIBRARY_PREFIX}_LIBRARIES) + set(LIBRARY_LIBS ${${LIBRARY_PREFIX}_LIBRARIES}) + elseif(${LIBRARY_PREFIX_UPPER}_LIBRARIES) + set(LIBRARY_LIBS ${${LIBRARY_PREFIX_UPPER}_LIBRARIES}) + else() + message(FATAL_ERROR "Found 
${ADD_LIBRARY_DEPENDENCY_LIBRARY}, but not it's libraries!") + endif() + ForEach (element ${ADD_LIBRARY_DEPENDENCY_TARGETS}) if (TARGET ${element}) get_target_property(TARGET_TYPE ${element} TYPE) if (${TARGET_TYPE} STREQUAL INTERFACE_LIBRARY) target_include_directories(${element} INTERFACE ${LIBRARY_HEADER}) - target_link_libraries(${element} INTERFACE ${${LIBRARY_PREFIX}_LIBRARIES}) + target_link_libraries(${element} INTERFACE ${LIBRARY_LIBS}) else() if (NOT ${TARGET_TYPE} STREQUAL OBJECT_LIBRARY) - target_link_libraries(${element} ${ADD_LIBRARY_DEPENDENCY_SCOPE} ${${LIBRARY_PREFIX}_LIBRARIES}) + target_link_libraries(${element} ${ADD_LIBRARY_DEPENDENCY_SCOPE} ${LIBRARY_LIBS}) endif() target_include_directories(${element} ${ADD_LIBRARY_DEPENDENCY_SCOPE} ${LIBRARY_HEADER}) endif() @@ -225,12 +238,12 @@ endfunction() function(SET_LINALG_BACKEND COMPONENT FLAG) OPTION(USE_EIGEN3_${FLAG} "Use ${COMPONENT} Eigen3" ON) CMAKE_DEPENDENT_OPTION( - USE_VIENNACL_${FLAG} "Use ${COMPONENT} ViennaCL" OFF - "VIENNACL_FOUND;USE_VIENNACL;NOT USE_EIGEN3_${FLAG}" ON) + USE_VIENNACL_${FLAG} "Use ${COMPONENT} ViennaCL" OFF + "VIENNACL_FOUND;USE_VIENNACL;NOT USE_EIGEN3_${FLAG}" ON) if(NOT ${COMPONENT}) - set(${COMPONENT} EIGEN3 CACHE STRING - "Set linear algebra backend ${COMPONENT}: EIGEN3, VIENNACL" - FORCE) + set(${COMPONENT} EIGEN3 CACHE STRING + "Set linear algebra backend ${COMPONENT}: EIGEN3, VIENNACL" + FORCE) endif() if (${COMPONENT} STREQUAL "EIGEN3") @@ -264,7 +277,7 @@ function(GET_META_EXAMPLE_VARS META_EXAMPLE EX_NAME REL_DIR NAME_WITH_DIR) endfunction() function(GET_INTERFACE_VARS INTERFACE DIRECTORY EXTENSION) - string(REGEX MATCH "INTERFACE_([a-zA-Z]+)" _dir ${INTERFACE}) + string(REGEX MATCH "INTERFACE_([a-zA-Z]+)" _dir ${INTERFACE}) STRING(TOLOWER "${CMAKE_MATCH_1}" _dir) SET(${DIRECTORY} ${_dir} PARENT_SCOPE) @@ -314,4 +327,4 @@ function(ADD_SHOGUN_BENCHMARK REL_BENCHMARK_NAME) if(ARGN) set_tests_properties(${BENCHMARK_NAME} PROPERTIES ${ARGN}) endif() 
-endfunction() +endfunction() \ No newline at end of file diff --git a/cmake/external/RapidJSON.cmake b/cmake/external/RapidJSON.cmake new file mode 100644 index 00000000000..0a6efe738c4 --- /dev/null +++ b/cmake/external/RapidJSON.cmake @@ -0,0 +1,16 @@ +set(RAPIDJSON_PREFIX ${CMAKE_BINARY_DIR}/RapidJSON) +set(RAPIDJSON_INCLUDE_DIR "${RAPIDJSON_PREFIX}/src/RapidJSON/include") +include(ExternalProject) +ExternalProject_Add( + RapidJSON + PREFIX ${RAPIDJSON_PREFIX} + DOWNLOAD_DIR ${THIRD_PARTY_DIR}/RapidJSON + URL https://github.com/Tencent/rapidjson/archive/v1.1.0.tar.gz + URL_MD5 badd12c511e081fec6c89c43a7027bce + CMAKE_ARGS -DRAPIDJSON_BUILD_DOC:BOOL=OFF + -DRAPIDJSON_BUILD_EXAMPLES:BOOL=OFF + -DRAPIDJSON_BUILD_TESTS:BOOL=OFF + INSTALL_COMMAND "" +) + +LIST(APPEND SHOGUN_DEPENDS RapidJSON) \ No newline at end of file diff --git a/src/interfaces/swig/IO.i b/src/interfaces/swig/IO.i index 75c0e10899e..4e59008adf4 100644 --- a/src/interfaces/swig/IO.i +++ b/src/interfaces/swig/IO.i @@ -25,6 +25,7 @@ %rename(SimpleFile) CSimpleFile; %rename(MemoryMappedFile) CMemoryMappedFile; +%shared_ptr(shogun::OpenMLFlow) %include %include @@ -120,6 +121,7 @@ namespace shogun %include %include +%include %include %include %include diff --git a/src/interfaces/swig/IO_includes.i b/src/interfaces/swig/IO_includes.i index 7bbdb373ccb..89401074d62 100644 --- a/src/interfaces/swig/IO_includes.i +++ b/src/interfaces/swig/IO_includes.i @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/src/interfaces/swig/SGBase.i b/src/interfaces/swig/SGBase.i index 27f0d067dd6..de7f79a772b 100644 --- a/src/interfaces/swig/SGBase.i +++ b/src/interfaces/swig/SGBase.i @@ -8,6 +8,7 @@ %include "stdint.i" %include "std_string.i" %include "exception.i" +%include "std_shared_ptr.i" %feature("ref") shogun::CSGObject "SG_REF($this);" %feature("unref") shogun::CSGObject "SG_UNREF($this);" diff --git a/src/shogun/CMakeLists.txt b/src/shogun/CMakeLists.txt index 
1698bc05e9f..8290d3a49b4 100644 --- a/src/shogun/CMakeLists.txt +++ b/src/shogun/CMakeLists.txt @@ -1,6 +1,6 @@ INCLUDE_DIRECTORIES( - ${CMAKE_SOURCE_DIR}/src - ${CMAKE_BINARY_DIR}/src + ${CMAKE_SOURCE_DIR}/src + ${CMAKE_BINARY_DIR}/src ) include(GenerateExportHeader) @@ -34,33 +34,33 @@ set(SHOGUN_CLING_LIBRARY_DIR "\"${CMAKE_INSTALL_PREFIX}/${SHOGUN_LIB_INSTALL}\"" if (MSVC OR BUILD_BENCHMARKS) SET(LIBSHOGUN_BUILD_STATIC ON - CACHE BOOL "Build libshogun static library" FORCE) + CACHE BOOL "Build libshogun static library" FORCE) endif() # Allow to hide non-bsd compatible codes OPTION(LICENSE_GPL_SHOGUN "Include GPL codes of Shogun (non-BSD compatible) in build" ON) SET(USE_GPL_SHOGUN 0) IF (LICENSE_GPL_SHOGUN) - SET(SHOGUN_GPL_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/src/gpl) - IF(EXISTS "${SHOGUN_GPL_INCLUDE_DIR}/shogun") - SET(USE_GPL_SHOGUN 1) + SET(SHOGUN_GPL_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/src/gpl) + IF(EXISTS "${SHOGUN_GPL_INCLUDE_DIR}/shogun") + SET(USE_GPL_SHOGUN 1) - FILE(GLOB_RECURSE GPL_LIBSHOGUN_SRC ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_CPP} ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_C}) - FILE(GLOB_RECURSE GPL_LIBSHOGUN_HEADERS ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_HEADER}) + FILE(GLOB_RECURSE GPL_LIBSHOGUN_SRC ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_CPP} ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_C}) + FILE(GLOB_RECURSE GPL_LIBSHOGUN_HEADERS ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_HEADER}) - LIST(APPEND LIBSHOGUN_SRC ${GPL_LIBSHOGUN_SRC}) - LIST(APPEND LIBSHOGUN_HEADERS ${GPL_LIBSHOGUN_HEADERS}) - INCLUDE_DIRECTORIES(${SHOGUN_GPL_INCLUDE_DIR}) + LIST(APPEND LIBSHOGUN_SRC ${GPL_LIBSHOGUN_SRC}) + LIST(APPEND LIBSHOGUN_HEADERS ${GPL_LIBSHOGUN_HEADERS}) + INCLUDE_DIRECTORIES(${SHOGUN_GPL_INCLUDE_DIR}) - SET(USE_GPL_SHOGUN 1) - ELSE() - MESSAGE(FATAL_ERROR "Shogun can only be built with GPL codes if the source files are in ${SHOGUN_GPL_INCLUDE_DIR}. 
Please download or disable with LICENSE_GPL_SHOGUN=OFF.\n") - ENDIF() + SET(USE_GPL_SHOGUN 1) + ELSE() + MESSAGE(FATAL_ERROR "Shogun can only be built with GPL codes if the source files are in ${SHOGUN_GPL_INCLUDE_DIR}. Please download or disable with LICENSE_GPL_SHOGUN=OFF.\n") + ENDIF() ENDIF() OPTION(USE_SVMLIGHT "SVMLight" ON) IF(USE_SVMLIGHT AND NOT USE_GPL_SHOGUN) - MESSAGE(FATAL_ERROR "Can only use SVMLight when GPL codes are included. Disable by USE_SVMLIGHT=OFF") + MESSAGE(FATAL_ERROR "Can only use SVMLight when GPL codes are included. Disable by USE_SVMLIGHT=OFF") ENDIF() # add target to compile the libshogun sources @@ -72,13 +72,13 @@ ENDIF() # Generate versionstring.h ADD_CUSTOM_TARGET( - version - ${CMAKE_COMMAND} - -D SRC=${CMAKE_CURRENT_SOURCE_DIR}/lib/versionstring.h.in - -D DST=${CMAKE_CURRENT_BINARY_DIR}/lib/versionstring.h - -D ROOT_DIR=${CMAKE_SOURCE_DIR} - -P ${CMAKE_SOURCE_DIR}/cmake/version.cmake - COMMENT "Generating version header" + version + ${CMAKE_COMMAND} + -D SRC=${CMAKE_CURRENT_SOURCE_DIR}/lib/versionstring.h.in + -D DST=${CMAKE_CURRENT_BINARY_DIR}/lib/versionstring.h + -D ROOT_DIR=${CMAKE_SOURCE_DIR} + -P ${CMAKE_SOURCE_DIR}/cmake/version.cmake + COMMENT "Generating version header" ) add_dependencies(libshogun version) @@ -97,12 +97,12 @@ FOREACH(template ${LIBSHOGUN_SRC_TMP}) ENDFOREACH() ADD_CUSTOM_COMMAND(OUTPUT ${generated_cpp} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${template} ${generated_cpp} - COMMAND ${PYTHON_EXECUTABLE} ${generator_script} ${generated_cpp} -in ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt - DEPENDS ${template} ${generator_script} version ${LIBSHOGUN_HEADERS} ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - COMMENT "Generating ${generated_cpp}" - ) + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${template} ${generated_cpp} + COMMAND ${PYTHON_EXECUTABLE} ${generator_script} ${generated_cpp} -in ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt + DEPENDS 
${template} ${generator_script} version ${LIBSHOGUN_HEADERS} ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Generating ${generated_cpp}" + ) ADD_CUSTOM_TARGET(${generated_target} DEPENDS ${generated_cpp}) add_dependencies(libshogun ${generated_target}) @@ -113,18 +113,18 @@ ENDFOREACH() add_library(shogun SHARED $ ${CMAKE_CURRENT_BINARY_DIR}/lib/config.h) generate_export_header(shogun) set_target_properties( - shogun PROPERTIES - VERSION ${LIBSHOGUNVER} - SOVERSION ${LIBSHOGUNSO} - INTERFACE_shogun_MAJOR_VERSION ${SHOGUN_VERSION_MAJOR}) + shogun PROPERTIES + VERSION ${LIBSHOGUNVER} + SOVERSION ${LIBSHOGUNSO} + INTERFACE_shogun_MAJOR_VERSION ${SHOGUN_VERSION_MAJOR}) set_property(TARGET shogun APPEND PROPERTY COMPATIBLE_INTERFACE_STRING ${SHOGUN_VERSION_MAJOR}) target_include_directories(shogun PUBLIC - $ - $ - $ -) + $ + $ + $ + ) IF (${USE_GPL_SHOGUN}) - target_include_directories(shogun PUBLIC $) + target_include_directories(shogun PUBLIC $) ENDIF() get_property(dirs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) @@ -137,33 +137,28 @@ endforeach() add_library(shogun_deps INTERFACE) target_link_libraries(shogun_deps INTERFACE shogun) target_include_directories(shogun_deps INTERFACE - $ - $ -) + $ + $ + ) IF (${USE_GPL_SHOGUN}) - target_include_directories(shogun_deps INTERFACE $) + target_include_directories(shogun_deps INTERFACE $) ENDIF() -# add bundled libraries to dependency -if(SHOGUN_DEPENDS) - add_dependencies(libshogun ${SHOGUN_DEPENDS}) -endif() - # add target for static library if enabled if (LIBSHOGUN_BUILD_STATIC) add_library(shogun-static STATIC $ ${CMAKE_CURRENT_BINARY_DIR}/lib/config.h) set_property(TARGET shogun-static PROPERTY OUTPUT_NAME shogun) target_include_directories(shogun-static PUBLIC - $ - $ - $ - ) + $ + $ + $ + ) IF (${USE_GPL_SHOGUN}) target_include_directories(shogun-static PUBLIC $) ENDIF() if(MSVC) - target_link_libraries(shogun-static PUBLIC winmm.lib) + 
target_link_libraries(shogun-static PUBLIC winmm.lib Shlwapi.lib) endif() ENDIF() @@ -171,20 +166,13 @@ IF(MSVC) IF(MSVC_VERSION VERSION_LESS "1700") include(external/MSIntTypes) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC - $ - $ - ) + $ + $ + ) ENDIF() - # bundle dirent - include(external/MSDirent) - SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC - $ - $ - ) - - target_link_libraries(shogun PUBLIC winmm) - target_link_libraries(shogun_deps INTERFACE winmm) + target_link_libraries(shogun PUBLIC winmm Shlwapi) + target_link_libraries(shogun_deps INTERFACE winmm Shlwapi) ENDIF() ########################### compiler capabilities @@ -258,7 +246,7 @@ if (NOT MSVC) SET(USE_HOARD 1) SET(POSTLINKFLAGS ${POSTLINKFLAGS} ${Hoard_LIBRARIES}) else () - message(FATAL_ERROR "Unable to use hoard malloc: library not found") + message(FATAL_ERROR "Unable to use hoard malloc: library not found") endif () else() message(STATUS "Using system's malloc") @@ -291,49 +279,49 @@ endif() include(external/StanMath) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ -) + $ + $ + ) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ -) + $ + $ + ) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ -) + $ + $ + ) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ -) + $ + $ + ) ####### LINALG # ViennaCL detection SHOGUN_DEPENDENCIES( - LIBRARY ViennaCL - SCOPE PRIVATE - VERSION ${VIENNACL_VERSION_MINIMUM} - CONFIG_FLAG HAVE_VIENNACL) + LIBRARY ViennaCL + SCOPE PRIVATE + VERSION ${VIENNACL_VERSION_MINIMUM} + CONFIG_FLAG HAVE_VIENNACL) ####### /LINALG FIND_PACKAGE(rxcpp) IF(NOT rxcpp_FOUND) - include(external/rxcpp) - SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ - ) + include(external/rxcpp) + SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM + $ + $ + ) ELSE() - SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM ${rxcpp_INCLUDE_DIR}) + SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM ${rxcpp_INCLUDE_DIR}) ENDIF() # TFLogger package FIND_PACKAGE(TFLogger 0.1.0 CONFIG) IF (TFLogger_FOUND) - SET(HAVE_TFLOGGER 1) - SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE SYSTEM 
${TFLogger_INCLUDE_DIR}) - target_link_libraries(shogun PRIVATE tflogger::tflogger) + SET(HAVE_TFLOGGER 1) + SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE SYSTEM ${TFLogger_INCLUDE_DIR}) + target_link_libraries(shogun PRIVATE tflogger::tflogger) ENDIF() #### LAPACK @@ -344,35 +332,40 @@ CHECK_CXX_SOURCE_COMPILES("#include \n int main(int argc, char** argv) IF (NOT HAVE_STD_VARIANT) include(external/variant) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ - ) + $ + $ + ) ENDIF() SHOGUN_DEPENDENCIES( - LIBRARY GLPK - SCOPE PRIVATE - CONFIG_FLAG USE_GLPK) + LIBRARY GLPK + SCOPE PRIVATE + CONFIG_FLAG USE_GLPK) SHOGUN_DEPENDENCIES( - LIBRARY CPLEX - SCOPE PRIVATE - CONFIG_FLAG USE_CPLEX) + LIBRARY LibArchive + SCOPE PRIVATE + CONFIG_FLAG HAVE_LIBARCHIVE) SHOGUN_DEPENDENCIES( - LIBRARY ARPACK - SCOPE PRIVATE - CONFIG_FLAG HAVE_ARPACK) + LIBRARY CPLEX + SCOPE PRIVATE + CONFIG_FLAG USE_CPLEX) SHOGUN_DEPENDENCIES( - LIBRARY Mosek - SCOPE PRIVATE - CONFIG_FLAG USE_MOSEK) + LIBRARY ARPACK + SCOPE PRIVATE + CONFIG_FLAG HAVE_ARPACK) SHOGUN_DEPENDENCIES( - LIBRARY Protobuf - SCOPE PUBLIC - CONFIG_FLAG HAVE_PROTOBUF) + LIBRARY Mosek + SCOPE PRIVATE + CONFIG_FLAG USE_MOSEK) + +SHOGUN_DEPENDENCIES( + LIBRARY Protobuf + SCOPE PUBLIC + CONFIG_FLAG HAVE_PROTOBUF) IF (PROTOBUF_FOUND AND ENABLE_PROTOBUF) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/io/protobuf) @@ -383,12 +376,12 @@ IF (PROTOBUF_FOUND AND ENABLE_PROTOBUF) SET(name "${CMAKE_CURRENT_BINARY_DIR}/io/protobuf/${FIL_WE}") ADD_CUSTOM_COMMAND( - OUTPUT "${name}.pb.cc" "${name}.pb.h" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${FIL} - --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/io/protobuf - --proto_path ${CMAKE_CURRENT_SOURCE_DIR}/io/protobuf - COMMENT "Running C++ protocol buffer compiler on ${FIL_WE}.proto" - ) + OUTPUT "${name}.pb.cc" "${name}.pb.h" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${FIL} + --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/io/protobuf + --proto_path ${CMAKE_CURRENT_SOURCE_DIR}/io/protobuf + COMMENT "Running C++ protocol buffer 
compiler on ${FIL_WE}.proto" + ) ADD_CUSTOM_TARGET("${FIL_WE}Protobuf" DEPENDS "${name}.pb.cc") add_dependencies(libshogun "${FIL_WE}Protobuf") set_source_files_properties(${name}.pb.cc PROPERTIES GENERATED ON) @@ -407,55 +400,58 @@ IF(BUNDLE_JSON) SET(POSTLINKFLAGS ${POSTLINKFLAGS} ${JSON_LDFLAGS}) ELSE() SHOGUN_DEPENDENCIES( - LIBRARY JSON - SCOPE PUBLIC - CONFIG_FLAG HAVE_JSON - VERSION 0.11) + LIBRARY JSON + SCOPE PUBLIC + CONFIG_FLAG HAVE_JSON + VERSION 0.11) ENDIF() SHOGUN_DEPENDENCIES( - LIBRARY LibXml2 - SCOPE PUBLIC - CONFIG_FLAG HAVE_XML) + LIBRARY LibXml2 + SCOPE PUBLIC + CONFIG_FLAG HAVE_XML) +# RapidJSON +include(external/RapidJSON) +SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC ${RAPIDJSON_INCLUDE_DIR}) if (NOT WIN32) # FIXME: HDF5 linking on WIN32 is broken. # at least with the hdf5 supplied in anaconda SHOGUN_DEPENDENCIES( - LIBRARY HDF5 - SCOPE PUBLIC - CONFIG_FLAG HAVE_HDF5) + LIBRARY HDF5 + SCOPE PUBLIC + CONFIG_FLAG HAVE_HDF5) endif () SHOGUN_DEPENDENCIES( - LIBRARY CURL - SCOPE PRIVATE - CONFIG_FLAG HAVE_CURL) + LIBRARY CURL + SCOPE PRIVATE + CONFIG_FLAG HAVE_CURL) SHOGUN_DEPENDENCIES( - LIBRARY ZLIB - SCOPE PRIVATE - CONFIG_FLAG USE_GZIP) + LIBRARY ZLIB + SCOPE PRIVATE + CONFIG_FLAG USE_GZIP) SHOGUN_DEPENDENCIES( - LIBRARY BZip2 - SCOPE PRIVATE - CONFIG_FLAG USE_BZIP2) + LIBRARY BZip2 + SCOPE PRIVATE + CONFIG_FLAG USE_BZIP2) SHOGUN_DEPENDENCIES( - LIBRARY LibLZMA - SCOPE PRIVATE - CONFIG_FLAG USE_LZMA) + LIBRARY LibLZMA + SCOPE PRIVATE + CONFIG_FLAG USE_LZMA) SHOGUN_DEPENDENCIES( - LIBRARY SNAPPY - SCOPE PRIVATE - CONFIG_FLAG USE_SNAPPY) + LIBRARY SNAPPY + SCOPE PRIVATE + CONFIG_FLAG USE_SNAPPY) SHOGUN_DEPENDENCIES( - LIBRARY LZO - SCOPE PRIVATE - CONFIG_FLAG USE_LZO) + LIBRARY LZO + SCOPE PRIVATE + CONFIG_FLAG USE_LZO) #integration OPTION(OpenCV "OpenCV Integration" OFF) @@ -483,25 +479,25 @@ IF(BUNDLE_NLOPT) SET(POSTLINKFLAGS ${POSTLINKFLAGS} ${NLOPT_LIBRARIES}) ELSE() SHOGUN_DEPENDENCIES( - LIBRARY NLopt - SCOPE PRIVATE - CONFIG_FLAG HAVE_NLOPT) + 
LIBRARY NLopt + SCOPE PRIVATE + CONFIG_FLAG HAVE_NLOPT) ENDIF() SHOGUN_DEPENDENCIES( - LIBRARY LpSolve - SCOPE PRIVATE - CONFIG_FLAG USE_LPSOLVE) + LIBRARY LpSolve + SCOPE PRIVATE + CONFIG_FLAG USE_LPSOLVE) SHOGUN_DEPENDENCIES( - LIBRARY ColPack - SCOPE PRIVATE - CONFIG_FLAG HAVE_COLPACK) + LIBRARY ColPack + SCOPE PRIVATE + CONFIG_FLAG HAVE_COLPACK) SHOGUN_DEPENDENCIES( - LIBRARY ARPREC - SCOPE PRIVATE - CONFIG_FLAG HAVE_ARPREC) + LIBRARY ARPREC + SCOPE PRIVATE + CONFIG_FLAG HAVE_ARPREC) ###### checks for random # PRNG settings @@ -511,12 +507,12 @@ SET(DSFMT_MEXP ${MEXP}) CHECK_FUNCTION_EXISTS(arc4random HAVE_ARC4RANDOM) IF(NOT HAVE_ARC4RANDOM) - # assume that /dev/random is non-blocking if /dev/urandom does not exist - if(EXISTS /dev/urandom) - set(DEV_RANDOM "/dev/urandom" CACHE INTERNAL "" FORCE) - elseif( EXISTS /dev/random ) - set(DEV_RANDOM "/dev/random" CACHE INTERNAL "" FORCE) - endif() + # assume that /dev/random is non-blocking if /dev/urandom does not exist + if(EXISTS /dev/urandom) + set(DEV_RANDOM "/dev/urandom" CACHE INTERNAL "" FORCE) + elseif( EXISTS /dev/random ) + set(DEV_RANDOM "/dev/random" CACHE INTERNAL "" FORCE) + endif() ENDIF() ############################ HMM @@ -535,11 +531,16 @@ OPTION(USE_LOGSUMARRAY "Use sum array, supposed to be a bit more accurate" OFF) #kernelcache to use 4-byte-floating-point values instead of 8-byte-doubles OPTION(USE_SHORTREAL_KERNELCACHE "Kernelcache to use 4-byte-floating-point values instead of 8-byte-doubles" ON) +# add bundled libraries to dependency +if(SHOGUN_DEPENDS) + add_dependencies(libshogun ${SHOGUN_DEPENDS}) +endif() + OPTION(USE_LOGCACHE "Use (1+exp(x)) log cache (is much faster but less accurate)" OFF) ################## linker optimisations OPTION(INCREMENTAL_LINKING "Enable incremantal linking") SET(INCREMENTAL_LINKING_DIR ${CMAKE_BINARY_DIR}/linker_cache - CACHE PATH "Incremental linking cache directory") + CACHE PATH "Incremental linking cache directory") OPTION(ENABLE_LTO "Enable link-time 
optimisation") include(LDGOLD) @@ -561,7 +562,7 @@ target_link_libraries(shogun PUBLIC ${POSTLINKFLAGS}) IF (BUILD_BENCHMARKS) IF (NOT BUILD_TYPE_UC STREQUAL "RELEASE") MESSAGE(WARNING "You are building benchmark against non-Release mode!" - "The results might not reflect the actual runtimes...") + "The results might not reflect the actual runtimes...") ENDIF() find_package(benchmark CONFIG REQUIRED) @@ -571,16 +572,16 @@ IF (BUILD_BENCHMARKS) target_link_libraries(shogun_benchmark_main benchmark::benchmark shogun-static) elseif(MSVC) target_link_libraries(shogun_benchmark_main - benchmark::benchmark - Shlwapi.lib - shogun-static - ) + benchmark::benchmark + Shlwapi.lib + shogun-static + ) else() target_link_libraries(shogun_benchmark_main - benchmark::benchmark - pthread - shogun-static - ) + benchmark::benchmark + pthread + shogun-static + ) endif() set(SHOGUN_BENCHMARK_LINK_LIBS shogun_benchmark_main) @@ -602,15 +603,15 @@ IF (LICENSE_GPL_SHOGUN) ENDIF() INSTALL( - DIRECTORY ${INCLUDE_HEADERS_DIR_LIST} - DESTINATION ${INCLUDE_INSTALL_DIR} - COMPONENT headers - FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" - PATTERN "CMakeFiles*" EXCLUDE - PATTERN "*.dir" EXCLUDE - PATTERN "*Debug*" EXCLUDE - PATTERN "*Release*" EXCLUDE - PATTERN ".settings" EXCLUDE) + DIRECTORY ${INCLUDE_HEADERS_DIR_LIST} + DESTINATION ${INCLUDE_INSTALL_DIR} + COMPONENT headers + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" + PATTERN "CMakeFiles*" EXCLUDE + PATTERN "*.dir" EXCLUDE + PATTERN "*Debug*" EXCLUDE + PATTERN "*Release*" EXCLUDE + PATTERN ".settings" EXCLUDE) # set the desidered targets to be installed set(INSTALL_TARGETS shogun) @@ -619,41 +620,41 @@ if (LIBSHOGUN_BUILD_STATIC) endif() INSTALL( - TARGETS ${INSTALL_TARGETS} - EXPORT ShogunTargets - ARCHIVE DESTINATION ${SHOGUN_LIB_INSTALL} - RUNTIME DESTINATION ${SHOGUN_LIB_INSTALL} - LIBRARY DESTINATION ${SHOGUN_LIB_INSTALL} - INCLUDES DESTINATION ${INCLUDE_INSTALL_DIR} + TARGETS ${INSTALL_TARGETS} + EXPORT ShogunTargets + ARCHIVE 
DESTINATION ${SHOGUN_LIB_INSTALL} + RUNTIME DESTINATION ${SHOGUN_LIB_INSTALL} + LIBRARY DESTINATION ${SHOGUN_LIB_INSTALL} + INCLUDES DESTINATION ${INCLUDE_INSTALL_DIR} ) file(TO_CMAKE_PATH ${SHOGUN_LIB_INSTALL}/cmake/shogun CONFIG_PACKAGE_DIR) configure_package_config_file( - ${CMAKE_SOURCE_DIR}/cmake/ShogunConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake - INSTALL_DESTINATION ${SHOGUN_LIB_INSTALL}/cmake/shogun - PATH_VARS INCLUDE_INSTALL_DIR CONFIG_PACKAGE_DIR) + ${CMAKE_SOURCE_DIR}/cmake/ShogunConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake + INSTALL_DESTINATION ${SHOGUN_LIB_INSTALL}/cmake/shogun + PATH_VARS INCLUDE_INSTALL_DIR CONFIG_PACKAGE_DIR) write_basic_package_version_file( - "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" - VERSION ${VERSION} - COMPATIBILITY SameMajorVersion + "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" + VERSION ${VERSION} + COMPATIBILITY SameMajorVersion ) export(EXPORT ShogunTargets - FILE "${CMAKE_CURRENT_BINARY_DIR}/ShogunTargets.cmake" - NAMESPACE shogun:: -) + FILE "${CMAKE_CURRENT_BINARY_DIR}/ShogunTargets.cmake" + NAMESPACE shogun:: + ) install( - EXPORT ShogunTargets - FILE ShogunTargets.cmake - NAMESPACE shogun:: - DESTINATION ${CONFIG_PACKAGE_DIR} + EXPORT ShogunTargets + FILE ShogunTargets.cmake + NAMESPACE shogun:: + DESTINATION ${CONFIG_PACKAGE_DIR} ) install( - FILES - "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake" - "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" - DESTINATION ${CONFIG_PACKAGE_DIR} -) + FILES + "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" + DESTINATION ${CONFIG_PACKAGE_DIR} +) \ No newline at end of file diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp new file mode 100644 index 00000000000..505869d6789 --- /dev/null +++ b/src/shogun/io/OpenMLFlow.cpp @@ -0,0 +1,187 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). 
+ * + * Authors: Gil Hoben + */ + +#include + +#ifdef HAVE_CURL + +using namespace shogun; +using namespace rapidjson; + +size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) +{ + // adapted from https://stackoverflow.com/a/5780603 + // Is there anything in the buffer? + if (buffer_in->empty()) + { + // Append the data to the buffer + buffer_in->append(data, size * nmemb); + + return size * nmemb; + } + + return 0; +} + +const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; +const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLReader::dataset_description = "/data/{}"; +const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; +const char* OpenMLReader::data_features = "/data/features/{}"; +const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; +const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; +const char* OpenMLReader::flow_file = "/flow/{}"; + +const std::unordered_map + OpenMLReader::m_format_options = {{"xml", xml_server}, + {"json", json_server}}; +const std::unordered_map + OpenMLReader::m_request_options = { + {"dataset_description", dataset_description}, + {"list_data_qualities", list_data_qualities}, + {"data_features", data_features}, + {"list_dataset_qualities", list_dataset_qualities}, + {"list_dataset_filter", list_dataset_filter}, + {"flow_file", flow_file}}; + +OpenMLReader::OpenMLReader(const std::string& api_key) : m_api_key(api_key) +{ +} + +void OpenMLReader::openml_curl_request_helper(const std::string& url) +{ + CURL* curl_handle = nullptr; + + curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.") + return; + } + + curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl_handle, CURLOPT_HTTPGET, 1); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, 
&m_curl_response_buffer); + + CURLcode res = curl_easy_perform(curl_handle); + + openml_curl_error_helper(curl_handle, res); + + curl_easy_cleanup(curl_handle); +} + +void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) +{ + if (code != CURLE_OK) + { + // TODO: call curl_easy_cleanup(curl_handle) ? + SG_SERROR("Curl error: %s\n", curl_easy_strerror(code)) + } +// else +// { +// long response_code; +// curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &response_code); + // if (response_code == 200) + // return; + // else + // { + // if (response_code == 181) + // SG_SERROR("Unknown flow. The flow with the given ID was not + //found in the database.") else if (response_code == 180) SG_SERROR("") + // SG_SERROR("Server code: %d\n", response_code) + // } +// } +} + +std::shared_ptr +OpenMLFlow::download_flow(const std::string& flow_id, const std::string& api_key) +{ + Document document; + parameters_type params; + components_type components; + std::string name; + std::string description; + std::string class_name; + + // get flow and parse with RapidJSON + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("flow_file", "json", flow_id); + document.Parse(return_string.c_str()); + check_flow_response(document); + + // store root for convenience. We know it exists from previous check. 
+ const Value& root = document["flow"]; + + // handle parameters + if (root.HasMember("parameter")) + { + std::unordered_map param_dict; + + if (root["parameter"].IsArray()) + { + for (const auto &v : root["parameter"].GetArray()) { + emplace_string_to_map(v, param_dict, "data_type"); + emplace_string_to_map(v, param_dict, "default_value"); + emplace_string_to_map(v, param_dict, "description"); + params.emplace(v["name"].GetString(), param_dict); + param_dict.clear(); + } + } + else + { + // parameter can also be a dict, instead of array + const auto v = root["parameter"].GetObject(); + emplace_string_to_map(v, param_dict, "data_type"); + emplace_string_to_map(v, param_dict, "default_value"); + emplace_string_to_map(v, param_dict, "description"); + params.emplace(v["name"].GetString(), param_dict); + } + } + + // handle components, i.e. kernels + if (root.HasMember("component")) + { + for (const auto& v : root["component"].GetArray()) + { + components.emplace( + v["identifier"].GetString(), + OpenMLFlow::download_flow(v["flow"]["id"].GetString(), api_key)); + } + } + + // get remaining information from flow + if (root.HasMember("name")) + name = root["name"].GetString(); + if (root.HasMember("description")) + description = root["description"].GetString(); + if (root.HasMember("class_name")) + class_name = root["class_name"].GetString(); + + auto flow = std::make_shared(name, description, class_name, components, params); + + return flow; +} + +void OpenMLFlow::check_flow_response(Document& doc) +{ + if (SG_UNLIKELY(doc.HasMember("error"))) + { + const Value& root = doc["error"]; + SG_SERROR( + "Server error %s: %s\n", root["code"].GetString(), + root["message"].GetString()) + return; + } + REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n"); +} + +void OpenMLFlow::upload_flow(const std::shared_ptr& flow) +{ +} + +#endif // HAVE_CURL diff --git a/src/shogun/io/OpenmlFlow.h b/src/shogun/io/OpenMLFlow.h similarity index 59% rename from 
src/shogun/io/OpenmlFlow.h rename to src/shogun/io/OpenMLFlow.h index 6787df3e252..dee68423269 100644 --- a/src/shogun/io/OpenmlFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -7,16 +7,22 @@ #ifndef SHOGUN_OPENMLFLOW_H #define SHOGUN_OPENMLFLOW_H +#include + #ifdef HAVE_CURL +#include #include #include +#include + +#include +#include #include #include #include #include -#include namespace shogun { @@ -90,10 +96,7 @@ namespace shogun return m_curl_response_buffer; } - void post(const std::string& request, const std::string& data); - private: - std::string m_curl_response_buffer; /** @@ -107,9 +110,10 @@ namespace shogun /** * Handles all possible codes * + * @param curl_handle curl handle used in the request * @param code the code returned by the query */ - void openml_curl_error_helper(CURLcode code); + void openml_curl_error_helper(CURL* curl_handle, CURLcode code); std::string m_api_key; @@ -132,21 +136,85 @@ namespace shogun static const char* flow_file; }; + class OpenMLWritter + { + public: + OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; + + private: + std::string m_api_key; + }; + class OpenMLFlow { public: - explicit OpenMLFlow( - const std::string& api_key, const std::string& flow_id) - : m_api_key(api_key), m_flow_id(flow_id){}; + using components_type = + std::unordered_map>; + using parameters_type = std::unordered_map< + std::string, std::unordered_map>; + + OpenMLFlow( + const std::string& name, const std::string& description, + const std::string& model, components_type components, + parameters_type parameters) + : m_name(name), m_description(description), m_class_name(model), + m_parameters(parameters), m_components(components) + { + } + + ~OpenMLFlow()= default; - void download_flow(); + static std::shared_ptr + download_flow(const std::string& flow_id, const std::string& api_key); - static void upload_flow(const OpenMLFlow& flow); + static void upload_flow(const std::shared_ptr& flow); + + std::shared_ptr get_subflow(const 
std::string& name) + { + auto find_flow = m_components.find(name); + if (find_flow != m_components.end()) + return find_flow->second; + else + SG_SERROR( + "The provided subflow could not be found in this flow!") + return nullptr; + } private: - std::string m_api_key; - std::string m_flow_id; + std::string m_name; + std::string m_description; + std::string m_class_name; + parameters_type m_parameters; + components_type m_components; + +#ifndef SWIG + static void check_flow_response(rapidjson::Document& doc); + + static SG_FORCED_INLINE void emplace_string_to_map( + const rapidjson::GenericValue>& v, + std::unordered_map& param_dict, + const std::string& name) + { + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + param_dict.emplace(name, v[name.c_str()].GetString()); + else + param_dict.emplace(name, ""); + } + + static SG_FORCED_INLINE void emplace_string_to_map( + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v, + std::unordered_map& param_dict, + const std::string& name) + { + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + param_dict.emplace(name, v[name.c_str()].GetString()); + else + param_dict.emplace(name, ""); + } + +#endif // SWIG }; } // namespace shogun #endif // HAVE_CURL diff --git a/src/shogun/io/OpenmlFlow.cpp b/src/shogun/io/OpenmlFlow.cpp deleted file mode 100644 index f9cdb19af00..00000000000 --- a/src/shogun/io/OpenmlFlow.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). - * - * Authors: Gil Hoben - */ - -#ifdef HAVE_CURL - -#include -#include "OpenmlFlow.h" - - -using namespace shogun; - -size_t writer(char *data, size_t size, size_t nmemb, std::string* buffer_in) -{ - // adapted from https://stackoverflow.com/a/5780603 - // Is there anything in the buffer? 
- if (buffer_in->empty()) - { - // Append the data to the buffer - buffer_in->append(data, size * nmemb); - - return size * nmemb; - } - - return 0; -} - -const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; -const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; -const char* OpenMLReader::dataset_description = "/data/{}"; -const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; -const char* OpenMLReader::data_features = "/data/features/{}"; -const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; -const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; -const char* OpenMLReader::flow_file = "/flow/{}"; - -const std::unordered_map - OpenMLReader::m_format_options = {{"xml", xml_server}, - {"json", json_server}}; -const std::unordered_map - OpenMLReader::m_request_options = { - - {"dataset_description", dataset_description}, - {"list_data_qualities", list_data_qualities}, - {"data_features", data_features}, - {"list_dataset_qualities", list_dataset_qualities}, - {"list_dataset_filter", list_dataset_filter}, - {"flow_file", flow_file}}; - -OpenMLReader::OpenMLReader(const std::string& api_key) : m_api_key(api_key) -{ -} - -void OpenMLReader::post(const std::string& request, const std::string& data) -{ -} - -void OpenMLReader::openml_curl_request_helper(const std::string& url) -{ - CURL* curl_handle = nullptr; - - curl_handle = curl_easy_init(); - - if (!curl_handle) - { - SG_SERROR("Failed to initialise curl handle.") - return; - } - - curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl_handle, CURLOPT_HTTPGET,1); - curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); - curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); - - CURLcode res = curl_easy_perform(curl_handle); - - openml_curl_error_helper(res); - - curl_easy_cleanup(curl_handle); -} - -void OpenMLReader::openml_curl_error_helper(CURLcode code) { - -} - - -void 
OpenMLFlow::download_flow() -{ - - auto reader = OpenMLReader(m_api_key); - auto return_string = reader.get("flow_file", "json", m_flow_id); -} - -void OpenMLFlow::upload_flow(const OpenMLFlow& flow) -{ -} - -#endif // HAVE_CURL From b70398d08f8973f0a0889e57bfc6bee49f62a160 Mon Sep 17 00:00:00 2001 From: gf712 Date: Tue, 7 May 2019 16:29:34 +0100 Subject: [PATCH 03/32] moved json dependency to library --- src/shogun/CMakeLists.txt | 2 +- src/shogun/io/OpenMLFlow.cpp | 197 +++++++++++++++++++++++++++++------ src/shogun/io/OpenMLFlow.h | 97 ++++++++++++----- 3 files changed, 236 insertions(+), 60 deletions(-) diff --git a/src/shogun/CMakeLists.txt b/src/shogun/CMakeLists.txt index 8290d3a49b4..506d4863cfd 100644 --- a/src/shogun/CMakeLists.txt +++ b/src/shogun/CMakeLists.txt @@ -412,7 +412,7 @@ SHOGUN_DEPENDENCIES( CONFIG_FLAG HAVE_XML) # RapidJSON include(external/RapidJSON) -SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC ${RAPIDJSON_INCLUDE_DIR}) +SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE ${RAPIDJSON_INCLUDE_DIR}) if (NOT WIN32) # FIXME: HDF5 linking on WIN32 is broken. diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index 505869d6789..a7ef3279f97 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -5,6 +5,10 @@ */ #include +#include +#include + +#include #ifdef HAVE_CURL @@ -59,7 +63,7 @@ void OpenMLReader::openml_curl_request_helper(const std::string& url) if (!curl_handle) { - SG_SERROR("Failed to initialise curl handle.") + SG_SERROR("Failed to initialise curl handle.\n") return; } @@ -82,24 +86,61 @@ void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) // TODO: call curl_easy_cleanup(curl_handle) ? SG_SERROR("Curl error: %s\n", curl_easy_strerror(code)) } -// else -// { -// long response_code; -// curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &response_code); - // if (response_code == 200) - // return; - // else - // { - // if (response_code == 181) - // SG_SERROR("Unknown flow. 
The flow with the given ID was not - //found in the database.") else if (response_code == 180) SG_SERROR("") - // SG_SERROR("Server code: %d\n", response_code) - // } -// } + // else + // { + // long response_code; + // curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, + //&response_code); if (response_code == 200) return; + // else + // { + // if (response_code == 181) + // SG_SERROR("Unknown flow. The flow with the given ID was not + // found in the database.") else if (response_code == 180) + // SG_SERROR("") SG_SERROR("Server code: %d\n", response_code) + // } + // } } -std::shared_ptr -OpenMLFlow::download_flow(const std::string& flow_id, const std::string& api_key) +#endif // HAVE_CURL + +static void check_flow_response(rapidjson::Document& doc) +{ + if (SG_UNLIKELY(doc.HasMember("error"))) + { + const Value& root = doc["error"]; + SG_SERROR( + "Server error %s: %s\n", root["code"].GetString(), + root["message"].GetString()) + return; + } + REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n"); +} + +static SG_FORCED_INLINE void emplace_string_to_map( + const rapidjson::GenericValue>& v, + std::unordered_map& param_dict, + const std::string& name) +{ + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + param_dict.emplace(name, v[name.c_str()].GetString()); + else + param_dict.emplace(name, ""); +} + +static SG_FORCED_INLINE void emplace_string_to_map( + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v, + std::unordered_map& param_dict, + const std::string& name) +{ + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + param_dict.emplace(name, v[name.c_str()].GetString()); + else + param_dict.emplace(name, ""); +} + +std::shared_ptr OpenMLFlow::download_flow( + const std::string& flow_id, const std::string& api_key) { Document document; parameters_type params; @@ -124,7 +165,8 @@ OpenMLFlow::download_flow(const std::string& flow_id, const std::string& api_key if (root["parameter"].IsArray()) { 
- for (const auto &v : root["parameter"].GetArray()) { + for (const auto& v : root["parameter"].GetArray()) + { emplace_string_to_map(v, param_dict, "data_type"); emplace_string_to_map(v, param_dict, "default_value"); emplace_string_to_map(v, param_dict, "description"); @@ -146,11 +188,22 @@ OpenMLFlow::download_flow(const std::string& flow_id, const std::string& api_key // handle components, i.e. kernels if (root.HasMember("component")) { - for (const auto& v : root["component"].GetArray()) + if (root["component"].IsArray()) + { + for (const auto& v : root["component"].GetArray()) + { + components.emplace( + v["identifier"].GetString(), + OpenMLFlow::download_flow( + v["flow"]["id"].GetString(), api_key)); + } + } + else { components.emplace( - v["identifier"].GetString(), - OpenMLFlow::download_flow(v["flow"]["id"].GetString(), api_key)); + root["component"]["identifier"].GetString(), + OpenMLFlow::download_flow( + root["component"]["flow"]["id"].GetString(), api_key)); } } @@ -162,26 +215,106 @@ OpenMLFlow::download_flow(const std::string& flow_id, const std::string& api_key if (root.HasMember("class_name")) class_name = root["class_name"].GetString(); - auto flow = std::make_shared(name, description, class_name, components, params); + auto flow = std::make_shared( + name, description, class_name, components, params); return flow; } -void OpenMLFlow::check_flow_response(Document& doc) +void OpenMLFlow::upload_flow(const std::shared_ptr& flow) { - if (SG_UNLIKELY(doc.HasMember("error"))) +} + +void OpenMLFlow::dump() +{ +} + +std::shared_ptr OpenMLFlow::from_file() +{ + return std::shared_ptr(); +} + +std::shared_ptr ShogunOpenML::flow_to_model( + std::shared_ptr flow, bool initialize_with_defaults) +{ + std::string name; + std::string val_as_string; + std::shared_ptr obj; + auto params = flow->get_parameters(); + auto components = flow->get_components(); + auto class_name = get_class_info(flow->get_class_name()); + auto module_name = std::get<0>(class_name); + 
auto algo_name = std::get<1>(class_name); + if (module_name == "machine") + obj = std::shared_ptr(machine(algo_name)); + else if (module_name == "kernel") + obj = std::shared_ptr(kernel(algo_name)); + else if (module_name == "distance") + obj = std::shared_ptr(distance(algo_name)); + else + SG_SERROR("Unsupported factory \"%s\"\n", module_name.c_str()) + auto obj_param = obj->get_params(); + + auto put_lambda = [&obj, &name, &val_as_string](const auto& val) { + // cast value using type from get, i.e. val + auto val_ = char_to_scalar>( + val_as_string.c_str()); + obj->put(name, val_); + }; + + if (initialize_with_defaults) { - const Value& root = doc["error"]; - SG_SERROR( - "Server error %s: %s\n", root["code"].GetString(), - root["message"].GetString()) - return; + for (const auto& param : params) + { + Any any_val = obj_param.at(param.first)->get_value(); + name = param.first; + val_as_string = param.second.at("default_value"); + sg_any_dispatch(any_val, sg_all_typemap, put_lambda); + } } - REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n"); + + for (const auto& component : components) + { + CSGObject* a = + flow_to_model(component.second, initialize_with_defaults).get(); + // obj->put(component.first, a); + } + + return obj; } -void OpenMLFlow::upload_flow(const std::shared_ptr& flow) +std::shared_ptr +ShogunOpenML::model_to_flow(const std::shared_ptr& model) { + return std::shared_ptr(); } -#endif // HAVE_CURL +std::tuple +ShogunOpenML::get_class_info(const std::string& class_name) +{ + std::vector class_components; + auto begin = class_name.begin(); + std::tuple result; + + for (auto it = class_name.begin(); it != class_name.end(); ++it) + { + if (*it == '.') + { + class_components.emplace_back(std::string(begin, it)); + begin = std::next(it); + } + if (std::next(it) == class_name.end()) + class_components.emplace_back(std::string(begin, std::next(it))); + } + if (class_components.size() != 3) + SG_SERROR("Invalid class name format %s\n", 
class_name.c_str()) + if (class_components[0] == "shogun") + result = std::make_tuple(class_components[1], class_components[2]); + else + SG_SERROR( + "The provided flow is not meant for shogun deserialisation! The " + "required library is \"%s\"\n", + class_components[0].c_str()) + + return result; +} diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index dee68423269..8fc46594a08 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -15,7 +15,6 @@ #include #include -#include #include #include @@ -150,9 +149,9 @@ namespace shogun public: using components_type = - std::unordered_map>; + std::unordered_map>; using parameters_type = std::unordered_map< - std::string, std::unordered_map>; + std::string, std::unordered_map>; OpenMLFlow( const std::string& name, const std::string& description, @@ -163,13 +162,15 @@ namespace shogun { } - ~OpenMLFlow()= default; - static std::shared_ptr download_flow(const std::string& flow_id, const std::string& api_key); + static std::shared_ptr from_file(); + static void upload_flow(const std::shared_ptr& flow); + void dump(); + std::shared_ptr get_subflow(const std::string& name) { auto find_flow = m_components.find(name); @@ -181,40 +182,82 @@ namespace shogun return nullptr; } +#ifndef SWIG + SG_FORCED_INLINE parameters_type get_parameters() + { + return m_parameters; + } + + SG_FORCED_INLINE components_type get_components() + { + return m_components; + } + + SG_FORCED_INLINE std::string get_class_name() + { + return m_class_name; + } +#endif // SWIG + private: std::string m_name; std::string m_description; std::string m_class_name; parameters_type m_parameters; components_type m_components; + }; #ifndef SWIG - static void check_flow_response(rapidjson::Document& doc); + template + T char_to_scalar(const char* string_val) + { + SG_SERROR( + "No registered conversion from string to type \"s\"\n", + demangled_type().c_str()) + return 0; + } + + template <> + float32_t char_to_scalar(const 
char* string_val) + { + char* end; + return std::strtof(string_val, &end); + } - static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericValue>& v, - std::unordered_map& param_dict, - const std::string& name) - { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else - param_dict.emplace(name, ""); - } + template <> + float64_t char_to_scalar(const char* string_val) + { + char* end; + return std::strtod(string_val, &end); + } - static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericObject< - true, rapidjson::GenericValue>>& v, - std::unordered_map& param_dict, - const std::string& name) - { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else - param_dict.emplace(name, ""); - } + template <> + floatmax_t char_to_scalar(const char* string_val) + { + char* end; + return std::strtold(string_val, &end); + } + + template <> + bool char_to_scalar(const char* string_val) + { + return strcmp(string_val, "true"); + } #endif // SWIG + + class ShogunOpenML + { + public: + static std::shared_ptr flow_to_model( + std::shared_ptr flow, bool initialize_with_defaults); + + static std::shared_ptr + model_to_flow(const std::shared_ptr& model); + + private: + static std::tuple + get_class_info(const std::string& class_name); }; } // namespace shogun #endif // HAVE_CURL From 0500504ffb9c0ccdaef0f4c51e7e5b9517f918c1 Mon Sep 17 00:00:00 2001 From: gf712 Date: Wed, 8 May 2019 12:25:49 +0100 Subject: [PATCH 04/32] initial ShogunOpenML class --- src/interfaces/swig/IO.i | 2 + src/shogun/base/SGObject.cpp | 1 - src/shogun/io/OpenMLFlow.cpp | 289 +++++++++++++++++++++++++++++------ src/shogun/io/OpenMLFlow.h | 135 +++++++++++----- 4 files changed, 343 insertions(+), 84 deletions(-) diff --git a/src/interfaces/swig/IO.i b/src/interfaces/swig/IO.i index 4e59008adf4..b3094311ec3 100644 --- 
a/src/interfaces/swig/IO.i +++ b/src/interfaces/swig/IO.i @@ -26,6 +26,8 @@ %rename(MemoryMappedFile) CMemoryMappedFile; %shared_ptr(shogun::OpenMLFlow) +%shared_ptr(shogun::ShogunOpenML::flow_to_model) +%shared_ptr(shogun::ShogunOpenML::model_to_flow) %include %include diff --git a/src/shogun/base/SGObject.cpp b/src/shogun/base/SGObject.cpp index b91eff7f709..48ff54a3f64 100644 --- a/src/shogun/base/SGObject.cpp +++ b/src/shogun/base/SGObject.cpp @@ -1112,5 +1112,4 @@ std::string CSGObject::string_enum_reverse_lookup( return p.second == enum_value; }); return enum_map_it->first; - } diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index a7ef3279f97..95a315af945 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -15,6 +15,14 @@ using namespace shogun; using namespace rapidjson; +/** + * The writer callback function used to write the packets to a C++ string. + * @param data the data received in CURL request + * @param size always 1 + * @param nmemb the size of data + * @param buffer_in the buffer to write to + * @return the size of buffer that was written + */ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) { // adapted from https://stackoverflow.com/a/5780603 @@ -30,13 +38,16 @@ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) return 0; } +/* OpenML server format */ const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +/* DATA API */ const char* OpenMLReader::dataset_description = "/data/{}"; const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; const char* OpenMLReader::data_features = "/data/features/{}"; const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; +/* FLOW API */ const char* OpenMLReader::flow_file = "/flow/{}"; const std::unordered_map @@ 
-84,26 +95,17 @@ void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) if (code != CURLE_OK) { // TODO: call curl_easy_cleanup(curl_handle) ? - SG_SERROR("Curl error: %s\n", curl_easy_strerror(code)) + SG_SERROR("Connection error: %s.\n", curl_easy_strerror(code)) } - // else - // { - // long response_code; - // curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, - //&response_code); if (response_code == 200) return; - // else - // { - // if (response_code == 181) - // SG_SERROR("Unknown flow. The flow with the given ID was not - // found in the database.") else if (response_code == 180) - // SG_SERROR("") SG_SERROR("Server code: %d\n", response_code) - // } - // } } #endif // HAVE_CURL -static void check_flow_response(rapidjson::Document& doc) +/** + * Checks the returned flow in JSON format + * @param doc the parsed flow + */ +static void check_flow_response(Document& doc) { if (SG_UNLIKELY(doc.HasMember("error"))) { @@ -116,24 +118,36 @@ static void check_flow_response(rapidjson::Document& doc) REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n"); } +/** + * Helper function to add JSON objects as string in map + * @param v a RapidJSON GenericValue, i.e. string + * @param param_dict the map to write to + * @param name the name of the key + */ static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericValue>& v, + const GenericValue>& v, std::unordered_map& param_dict, const std::string& name) { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + if (v[name.c_str()].GetType() == Type::kStringType) param_dict.emplace(name, v[name.c_str()].GetString()); else param_dict.emplace(name, ""); } +/** + * Helper function to add JSON objects as string in map + * @param v a RapidJSON GenericObject, i.e. 
array + * @param param_dict the map to write to + * @param name the name of the key + */ static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericObject< - true, rapidjson::GenericValue>>& v, + const GenericObject< + true, GenericValue>>& v, std::unordered_map& param_dict, const std::string& name) { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + if (v[name.c_str()].GetType() == Type::kStringType) param_dict.emplace(name, v[name.c_str()].GetString()); else param_dict.emplace(name, ""); @@ -234,52 +248,235 @@ std::shared_ptr OpenMLFlow::from_file() return std::shared_ptr(); } +/** + * Class using the Any visitor pattern to convert + * a string to a C++ type that can be used as a parameter + * in a Shogun model. + */ +class StringToShogun : public AnyVisitor +{ +public: + explicit StringToShogun(std::shared_ptr model) + : m_model(model), m_parameter(""), m_string_val(""){}; + + StringToShogun( + std::shared_ptr model, const std::string& parameter, + const std::string& string_val) + : m_model(model), m_parameter(parameter), m_string_val(string_val){}; + + void on(bool* v) final + { + if (!is_null()) + { + SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + bool result = strcmp(m_string_val.c_str(), "true") == 0; + m_model->put(m_parameter, result); + } + } + void on(int32_t* v) final + { + if (!is_null()) + { + SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + try + { + int32_t result = std::stoi(m_string_val); + m_model->put(m_parameter, result); + } + catch (const std::invalid_argument&) + { + // it's an option, i.e. 
internally represented + // as an enum but in swig exposed as a string + m_string_val.erase( + std::remove_if( + m_string_val.begin(), m_string_val.end(), + // remove quotes + [](const auto& val) { return val == '\"'; }), + m_string_val.end()); + m_model->put(m_parameter, m_string_val); + } + } + } + void on(int64_t* v) final + { + if (!is_null()) + { + SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + int64_t result = std::stol(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(float* v) final + { + if (!is_null()) + { + SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + char* end; + float32_t result = std::strtof(m_string_val.c_str(), &end); + m_model->put(m_parameter, result); + } + } + void on(double* v) final + { + if (!is_null()) + { + SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + char* end; + float64_t result = std::strtod(m_string_val.c_str(), &end); + m_model->put(m_parameter, result); + } + } + void on(long double* v) + { + if (!is_null()) + { + SG_SDEBUG("long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + char* end; + floatmax_t result = std::strtold(m_string_val.c_str(), &end); + m_model->put(m_parameter, result); + } + } + void on(CSGObject** v) final + { + SG_SDEBUG("CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + 
SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + + bool is_null() + { + bool result = strcmp(m_string_val.c_str(), "null") == 0; + return result; + } + + void set_parameter_name(const std::string& name) + { + m_parameter = name; + } + + void set_string_value(const std::string& value) + { + m_string_val = value; + } + +private: + std::shared_ptr m_model; + std::string m_parameter; + std::string m_string_val; +}; + +/** + * Instantiates a CSGObject using a factory + * @param factory_name the name of the factory + * @param algo_name the name of algorithm passed to factory + * @return the instantiated object using a factory + */ +std::shared_ptr instantiate_model_from_factory( + const std::string& factory_name, const std::string& algo_name) +{ + std::shared_ptr obj; + if (factory_name == "machine") + obj = std::shared_ptr(machine(algo_name)); + else if (factory_name == "kernel") + obj = std::shared_ptr(kernel(algo_name)); + else if (factory_name == "distance") + obj = std::shared_ptr(distance(algo_name)); + else + SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) + + return obj; +} + +/** + * Downcasts a CSGObject and puts it in the map of obj. + * @param obj the main object + * @param nested_obj the object to be casted and put in the obj map. 
+ * @param parameter_name the name of nested_obj + */ +void cast_and_put( + const std::shared_ptr& obj, + const std::shared_ptr& nested_obj, + const std::string& parameter_name) +{ + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + // TODO: remove clone + // temporary fix until shared_ptr PR merged + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + } + else if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + } + else if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + } + else + SG_SERROR("Could not cast SGObject.\n") +} + std::shared_ptr ShogunOpenML::flow_to_model( std::shared_ptr flow, bool initialize_with_defaults) { - std::string name; - std::string val_as_string; - std::shared_ptr obj; auto params = flow->get_parameters(); auto components = flow->get_components(); auto class_name = get_class_info(flow->get_class_name()); auto module_name = std::get<0>(class_name); auto algo_name = std::get<1>(class_name); - if (module_name == "machine") - obj = std::shared_ptr(machine(algo_name)); - else if (module_name == "kernel") - obj = std::shared_ptr(kernel(algo_name)); - else if (module_name == "distance") - obj = std::shared_ptr(distance(algo_name)); - else - SG_SERROR("Unsupported factory \"%s\"\n", module_name.c_str()) + + auto obj = instantiate_model_from_factory(module_name, algo_name); auto obj_param = obj->get_params(); - auto put_lambda = [&obj, &name, &val_as_string](const auto& val) { - // cast value using type from get, i.e. 
val - auto val_ = char_to_scalar>( - val_as_string.c_str()); - obj->put(name, val_); - }; + std::unique_ptr visitor(new StringToShogun(obj)); if (initialize_with_defaults) { for (const auto& param : params) { Any any_val = obj_param.at(param.first)->get_value(); - name = param.first; - val_as_string = param.second.at("default_value"); - sg_any_dispatch(any_val, sg_all_typemap, put_lambda); + std::string name = param.first; + std::string val_as_string = param.second.at("default_value"); + visitor->set_parameter_name(name); + visitor->set_string_value(val_as_string); + any_val.visit(visitor.get()); } } for (const auto& component : components) { - CSGObject* a = - flow_to_model(component.second, initialize_with_defaults).get(); - // obj->put(component.first, a); + std::shared_ptr nested_obj = + flow_to_model(component.second, initialize_with_defaults); + cast_and_put(obj, nested_obj, component.first); } + SG_SDEBUG("Final object: %s.\n", obj->to_string().c_str()); + return obj; } @@ -306,15 +503,15 @@ ShogunOpenML::get_class_info(const std::string& class_name) if (std::next(it) == class_name.end()) class_components.emplace_back(std::string(begin, std::next(it))); } - if (class_components.size() != 3) - SG_SERROR("Invalid class name format %s\n", class_name.c_str()) if (class_components[0] == "shogun") result = std::make_tuple(class_components[1], class_components[2]); else SG_SERROR( "The provided flow is not meant for shogun deserialisation! The " - "required library is \"%s\"\n", + "required library is \"%s\".\n", class_components[0].c_str()) + if (class_components.size() != 3) + SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) return result; } diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 8fc46594a08..8c00ffedb49 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -25,6 +25,9 @@ namespace shogun { + /** + * Reads OpenML streams which can be downloaded with this function. 
+ */ class OpenMLReader { @@ -96,6 +99,7 @@ namespace shogun } private: + /** the raw buffer as a C++ string */ std::string m_curl_response_buffer; /** @@ -114,13 +118,18 @@ namespace shogun */ void openml_curl_error_helper(CURL* curl_handle, CURLcode code); + /** the user API key, not required for all requests */ std::string m_api_key; + /** the server path to get a response in XML format*/ static const char* xml_server; + /** the server path to get a response in JSON format*/ static const char* json_server; + /** the server response format options: XML or JSON */ static const std::unordered_map m_format_options; + /** all the supported server options */ static const std::unordered_map m_request_options; @@ -135,24 +144,48 @@ namespace shogun static const char* flow_file; }; + /** + * Writes OpenML streams to the OpenML server. + */ class OpenMLWritter { public: OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; private: + /** the user API key, likely to be needed to write to OpenML */ std::string m_api_key; }; + /** + * Handles OpenML flows. A flow contains the information + * required to instantiate a model. + */ class OpenMLFlow { public: + /** alias for component type, map of flows */ using components_type = std::unordered_map>; + /** alias for parameter type, map of maps with information specific to a + * parameter */ using parameters_type = std::unordered_map< std::string, std::unordered_map>; + /** + * The OpenMLFlow constructor. This constructor is rarely used by the + * user and is used by the static class members download_flow and + * from_file. The user is expected to use either of the previously + * mentioned functions. + * + * @param name the model name + * @param description the model description + * @param model the flow class_name field + * @param components a map of subflows, i.e. kernels + * @param parameters a map of parameter information, i.e. 
default values + * for each parameter name + */ OpenMLFlow( const std::string& name, const std::string& description, const std::string& model, components_type components, @@ -162,15 +195,39 @@ namespace shogun { } + /** + * Instantiates a OpenMLFlow by downloaded a flow from the OpenML server. + * + * @param flow_id the flow ID + * @param api_key the user API key (might not be required and can be an empty string) + * @return the OpenMLFlow corresponding to the flow requested + * @throws ShogunException when there is a server error or the requested flow is ill formed. + */ static std::shared_ptr download_flow(const std::string& flow_id, const std::string& api_key); + /** + * Instantiates a OpenMLFlow from a file. + * @return the OpenMLFlow corresponding to the flow requested + */ static std::shared_ptr from_file(); + /** + * Publishes a flow to the OpenML server + * @param flow the flow to be published + */ static void upload_flow(const std::shared_ptr& flow); + /** + * Dumps the OpenMLFlow to disk. + */ void dump(); + /** + * Gets a subflow, i.e. 
a kernel in a machine + * @param name the name of the subflow, not the flow ID + * @return the subflow if it exists + */ std::shared_ptr get_subflow(const std::string& name) { auto find_flow = m_components.find(name); @@ -200,62 +257,66 @@ namespace shogun #endif // SWIG private: + /** name field of the flow */ std::string m_name; + /** description field of the flow */ std::string m_description; + /** the class_name field of the flow */ std::string m_class_name; + /** the parameter field of the flow (optional) */ parameters_type m_parameters; + /** the components fields of the flow (optional) */ components_type m_components; }; -#ifndef SWIG - template - T char_to_scalar(const char* string_val) - { - SG_SERROR( - "No registered conversion from string to type \"s\"\n", - demangled_type().c_str()) - return 0; - } - - template <> - float32_t char_to_scalar(const char* string_val) - { - char* end; - return std::strtof(string_val, &end); - } - - template <> - float64_t char_to_scalar(const char* string_val) + /** + * Handles OpenML tasks. A task contains all the information + * required to train and test a model. + */ + class OpenMLTask { - char* end; - return std::strtod(string_val, &end); - } - - template <> - floatmax_t char_to_scalar(const char* string_val) - { - char* end; - return std::strtold(string_val, &end); - } - - template <> - bool char_to_scalar(const char* string_val) - { - return strcmp(string_val, "true"); - } - -#endif // SWIG + public: + OpenMLTask(); + }; + /** + * The Shogun OpenML extension to run models from an OpenMLFlow + * and convert models to OpenMLFlow. + */ class ShogunOpenML { public: + /** + * Instantiates a SGObject from an OpenMLFlow. 
+ * + * @param flow the flow to instantiate + * @param initialize_with_defaults whether to use the default values + * specified in the flow + * @return the flow as a trainable model + */ static std::shared_ptr flow_to_model( std::shared_ptr flow, bool initialize_with_defaults); + /** + * Converts a SGObject to an OpenMLFlow. + * + * @param model the model to convert + * @return the flow from the model conversion + */ static std::shared_ptr model_to_flow(const std::shared_ptr& model); private: + /** + * Helper function to extract module/factory information from the class + * name field of OpenMLFlow. Throws an error either if the class name + * field is ill formed (i.e. not library.module.algorithm) or if the + * library name is not "shogun". + * + * @param class_name the flow class_name field + * @return a tuple with the module name (factory string) and the + * algorithm name + */ static std::tuple get_class_info(const std::string& class_name); }; From bbb493a64e5454c3e4ee126cc592cbbd4750b130 Mon Sep 17 00:00:00 2001 From: gf712 Date: Wed, 8 May 2019 14:36:36 +0100 Subject: [PATCH 05/32] started working datasets --- src/interfaces/swig/IO.i | 2 - src/shogun/io/OpenMLFlow.cpp | 403 ++++++++++++++++++++++++++++++----- src/shogun/io/OpenMLFlow.h | 168 ++++++++++++++- 3 files changed, 508 insertions(+), 65 deletions(-) diff --git a/src/interfaces/swig/IO.i b/src/interfaces/swig/IO.i index b3094311ec3..4e59008adf4 100644 --- a/src/interfaces/swig/IO.i +++ b/src/interfaces/swig/IO.i @@ -26,8 +26,6 @@ %rename(MemoryMappedFile) CMemoryMappedFile; %shared_ptr(shogun::OpenMLFlow) -%shared_ptr(shogun::ShogunOpenML::flow_to_model) -%shared_ptr(shogun::ShogunOpenML::model_to_flow) %include %include diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index 95a315af945..b8572e05a31 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -10,11 +10,11 @@ #include -#ifdef HAVE_CURL - using namespace shogun; using namespace rapidjson; 
+#ifdef HAVE_CURL + /** * The writer callback function used to write the packets to a C++ string. * @param data the data received in CURL request @@ -45,10 +45,13 @@ const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; const char* OpenMLReader::dataset_description = "/data/{}"; const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; const char* OpenMLReader::data_features = "/data/features/{}"; +const char* OpenMLReader::data_qualities = "/data/qualities/{}"; const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; /* FLOW API */ const char* OpenMLReader::flow_file = "/flow/{}"; +/* TASK API */ +const char* OpenMLReader::task_file = "/task/{}"; const std::unordered_map OpenMLReader::m_format_options = {{"xml", xml_server}, @@ -58,9 +61,11 @@ const std::unordered_map {"dataset_description", dataset_description}, {"list_data_qualities", list_data_qualities}, {"data_features", data_features}, + {"data_qualities", data_qualities}, {"list_dataset_qualities", list_dataset_qualities}, {"list_dataset_filter", list_dataset_filter}, - {"flow_file", flow_file}}; + {"flow_file", flow_file}, + {"task_file", task_file}}; OpenMLReader::OpenMLReader(const std::string& api_key) : m_api_key(api_key) { @@ -102,10 +107,10 @@ void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) #endif // HAVE_CURL /** - * Checks the returned flow in JSON format - * @param doc the parsed flow + * Checks the returned response from OpenML in JSON format + * @param doc the parsed OpenML JSON format response */ -static void check_flow_response(Document& doc) +static void check_response(const Document& doc, const std::string& type) { if (SG_UNLIKELY(doc.HasMember("error"))) { @@ -115,7 +120,9 @@ static void check_flow_response(Document& doc) root["message"].GetString()) return; } - REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n"); + REQUIRE( + 
doc.HasMember(type.c_str()), "Unexpected format of OpenML %s.\n", + type.c_str()); } /** @@ -142,8 +149,7 @@ static SG_FORCED_INLINE void emplace_string_to_map( * @param name the name of the key */ static SG_FORCED_INLINE void emplace_string_to_map( - const GenericObject< - true, GenericValue>>& v, + const GenericObject>>& v, std::unordered_map& param_dict, const std::string& name) { @@ -153,6 +159,55 @@ static SG_FORCED_INLINE void emplace_string_to_map( param_dict.emplace(name, ""); } +template +SG_FORCED_INLINE T return_if_possible( + const std::string& name, + const GenericObject>>& v) +{ + SG_SNOTIMPLEMENTED +} + +template <> +SG_FORCED_INLINE std::string return_if_possible( + const std::string& name, + const GenericObject>>& v) +{ + if (v.HasMember(name.c_str()) && v[name.c_str()].IsString()) + return v[name.c_str()].GetString(); + if (v.HasMember(name.c_str()) && !v[name.c_str()].IsString()) + SG_SERROR("Found member \"%s\" but it is not a string", name.c_str()) + if (!v.HasMember(name.c_str())) + return ""; + SG_SERROR("\"%s\" is not a member of the given object", name.c_str()) + return nullptr; +} + +template <> +SG_FORCED_INLINE std::vector +return_if_possible>( + const std::string& name, + const GenericObject>>& v) +{ + std::vector result; + if (!v.HasMember(name.c_str())) + SG_SERROR("\"%s\" is not a member of the given object", name.c_str()) + if (v[name.c_str()].IsString()) + { + result.emplace_back(v[name.c_str()].GetString()); + } + if (v[name.c_str()].IsArray()) + { + for (const auto& val : v[name.c_str()].GetArray()) + { + if (val.IsString()) + result.emplace_back(val.GetString()); + else + SG_SERROR("Found non string member in \"%s\".\n", name.c_str()) + } + } + return result; +} + std::shared_ptr OpenMLFlow::download_flow( const std::string& flow_id, const std::string& api_key) { @@ -167,7 +222,7 @@ std::shared_ptr OpenMLFlow::download_flow( auto reader = OpenMLReader(api_key); auto return_string = reader.get("flow_file", "json", flow_id); 
document.Parse(return_string.c_str()); - check_flow_response(document); + check_response(document, "flow"); // store root for convenience. We know it exists from previous check. const Value& root = document["flow"]; @@ -237,21 +292,241 @@ std::shared_ptr OpenMLFlow::download_flow( void OpenMLFlow::upload_flow(const std::shared_ptr& flow) { + SG_SNOTIMPLEMENTED; } void OpenMLFlow::dump() { + SG_SNOTIMPLEMENTED; } std::shared_ptr OpenMLFlow::from_file() { + SG_SNOTIMPLEMENTED; return std::shared_ptr(); } +std::shared_ptr +OpenMLData::get_data(const std::string& id, const std::string& api_key) +{ + // description + Document document; + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("dataset_description", "json", id); + + document.Parse(return_string.c_str()); + check_response(document, "data_set_description"); + + const Value& dataset_description = document["data_set_description"]; + + auto name = return_if_possible( + "name", dataset_description.GetObject()); + auto description = return_if_possible( + "description", dataset_description.GetObject()); + auto data_format = return_if_possible( + "data_format", dataset_description.GetObject()); + auto dataset_id = + return_if_possible("id", dataset_description.GetObject()); + auto version = return_if_possible( + "version", dataset_description.GetObject()); + auto creator = return_if_possible( + "creator", dataset_description.GetObject()); + auto contributor = return_if_possible( + "contributor", dataset_description.GetObject()); + auto collection_date = return_if_possible( + "collection_date", dataset_description.GetObject()); + auto upload_date = return_if_possible( + "upload_date", dataset_description.GetObject()); + auto language = return_if_possible( + "language", dataset_description.GetObject()); + auto licence = return_if_possible( + "licence", dataset_description.GetObject()); + auto url = + return_if_possible("url", dataset_description.GetObject()); + auto default_target_attribute = 
return_if_possible( + "default_target_attribute", dataset_description.GetObject()); + auto row_id_attribute = return_if_possible( + "row_id_attribute", dataset_description.GetObject()); + auto ignore_attribute = return_if_possible( + "ignore_attribute", dataset_description.GetObject()); + auto version_label = return_if_possible( + "version_label", dataset_description.GetObject()); + auto citation = return_if_possible( + "citation", dataset_description.GetObject()); + auto tags = return_if_possible>( + "tag", dataset_description.GetObject()); + auto visibility = return_if_possible( + "visibility", dataset_description.GetObject()); + auto original_data_url = return_if_possible( + "original_data_url", dataset_description.GetObject()); + auto paper_url = return_if_possible( + "paper_url", dataset_description.GetObject()); + auto update_comment = return_if_possible( + "update_comment", dataset_description.GetObject()); + auto md5_checksum = return_if_possible( + "md5_checksum", dataset_description.GetObject()); + + // features + std::vector> param_vector; + return_string = reader.get("data_features", "json", id); + document.Parse(return_string.c_str()); + check_response(document, "data_features"); + const Value& dataset_features = document["data_features"]; + for (const auto& param : dataset_features.GetArray()) + { + std::unordered_map param_map; + for (const auto& param_descriptors : param.GetObject()) + { + param_map.emplace( + param_descriptors.name.GetString(), + param_descriptors.value.GetString()); + } + param_vector.push_back(param_map); + } + + // qualities + std::vector> qualities_vector; + return_string = reader.get("data_qualities", "json", id); + document.Parse(return_string.c_str()); + check_response(document, "data_qualities"); + const Value& data_qualities = document["data_qualities"]; + for (const auto& param : data_qualities.GetArray()) + { + std::unordered_map param_map; + for (const auto& param_quality : param.GetObject()) + { + param_map.emplace( + 
param_quality.name.GetString(), + param_quality.value.GetString()); + } + qualities_vector.push_back(param_map); + } + + auto result = std::make_shared( + name, description, data_format, dataset_id, version, creator, + contributor, collection_date, upload_date, language, licence, url, + default_target_attribute, row_id_attribute, ignore_attribute, + version_label, citation, tags, visibility, original_data_url, paper_url, + update_comment, md5_checksum, param_vector, qualities_vector); + + return result; +} + +std::string OpenMLData::get_data_buffer(const std::string& api_key) +{ + SG_SNOTIMPLEMENTED; + return nullptr; +} + +std::shared_ptr +OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) +{ + Document document; + std::string task_name; + std::string task_type_id; + std::shared_ptr openml_dataset; + std::shared_ptr openml_split; + std::pair, std::shared_ptr> + task_descriptor; + + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("task_file", "json", task_id); + + document.Parse(return_string.c_str()); + check_response(document, "task"); + + const Value& root = document["task"]; + + REQUIRE( + task_id == root["task_id"].GetString(), + "Expected downloaded task to have the same id as the requested task " + "id, but got \"%s\", instead of \"%s\".\n", + root["task_id"].GetString(), task_id.c_str()) + + task_name = root["task_name"].GetString(); + OpenMLTask::TaskType task_type = + get_task_from_string(root["task_type"].GetString()); + task_type_id = root["task_type_id"].GetString(); + + // expect two elements in input array: dataset and split + const Value& json_input = root["input"]; + + REQUIRE( + json_input.IsArray(), "Currently the dataset reader can only handle " + "inputs with a dataset and split field.\n") + + auto input_array = json_input.GetArray(); + REQUIRE( + input_array.Size() == 2, + "Currently the dataset reader can only handle inputs with a dataset " + "and split fields. 
Found %d elements.\n", + input_array.Size()) + + // handle dataset + auto json_dataset = input_array[0].GetObject(); + + if (strcmp(json_dataset["name"].GetString(), "source_data") == 0) + { + auto dataset_info = json_dataset["data_set"].GetObject(); + std::string dataset_id = dataset_info["data_set_id"].GetString(); + std::string target_feature = dataset_info["target_feature"].GetString(); + // openml_dataset = + // std::make_shared(dataset_id, target_feature); + } + else + SG_SERROR("Error parsing the OpenML dataset, could not find the " + "source_data field.\n") + + // handle split + auto json_split = input_array[1].GetObject(); + if (strcmp(json_split["name"].GetString(), "estimation_procedure") == 0) + { + auto split_info = json_dataset["estimation_procedure"].GetObject(); + std::string split_id = split_info["id"].GetString(); + std::string split_type = split_info["type"].GetString(); + std::string split_url = split_info["data_splits_url"].GetString(); + std::unordered_map split_parameters; + for (const auto& param : split_info["parameter"].GetArray()) + { + if (param.Size() == 2) + split_parameters.emplace( + param["name"].GetString(), param["value"].GetString()); + else if (param.Size() == 1) + split_parameters.emplace(param["name"].GetString(), ""); + else + SG_SERROR("Unexpected number of parameters in parameter array " + "of estimation_procedure.\n") + } + openml_split = std::make_shared( + split_id, split_type, split_url, split_parameters); + } + else + SG_SERROR("Error parsing the OpenML dataset, could not find the " + "estimation_procedure field.\n") + + task_descriptor = std::make_pair(openml_dataset, openml_split); + + auto result = std::make_shared( + task_id, task_name, task_type, task_type_id, task_descriptor); + + return result; +} + +OpenMLTask::TaskType +OpenMLTask::get_task_from_string(const std::string& task_type) +{ + if (task_type == "Supervised Classification") + return OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION; + 
SG_SERROR("OpenMLTask does not supported \"%s\"", task_type.c_str()) +} + /** * Class using the Any visitor pattern to convert * a string to a C++ type that can be used as a parameter - * in a Shogun model. + * in a Shogun model. If the string value is not "null" it will + * be put in its casted type in the given model with the provided parameter + * name. If the value is null nothing happens, i.e. no error is thrown + * and no value is put in model. */ class StringToShogun : public AnyVisitor { @@ -266,18 +541,18 @@ class StringToShogun : public AnyVisitor void on(bool* v) final { + SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) bool result = strcmp(m_string_val.c_str(), "true") == 0; m_model->put(m_parameter, result); } } void on(int32_t* v) final { + SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) try { int32_t result = std::stoi(m_string_val); @@ -299,84 +574,94 @@ class StringToShogun : public AnyVisitor } void on(int64_t* v) final { + SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + int64_t result = std::stol(m_string_val); m_model->put(m_parameter, result); } } void on(float* v) final { + SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - char* end; - float32_t result = std::strtof(m_string_val.c_str(), &end); + float32_t result = std::stof(m_string_val); m_model->put(m_parameter, result); } } void on(double* v) final { + SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - char* end; - float64_t 
result = std::strtod(m_string_val.c_str(), &end); + float64_t result = std::stod(m_string_val); m_model->put(m_parameter, result); } } void on(long double* v) { + SG_SDEBUG( + "long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - char* end; - floatmax_t result = std::strtold(m_string_val.c_str(), &end); + floatmax_t result = std::stold(m_string_val); m_model->put(m_parameter, result); } } void on(CSGObject** v) final { - SG_SDEBUG("CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGVector* v) final { - SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGVector* v) final { - SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } void on(SGVector* v) final { - SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } void on(SGMatrix* mat) final { - SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGMatrix* mat) final { - SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - } - void on(SGMatrix* mat) final - { - SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } - - bool is_null() + void on(SGMatrix* mat) final{SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())} + + /** + * In OpenML "null" is an empty parameter value field. 
+ * @return whether the field is "null" + */ + SG_FORCED_INLINE bool is_null() { bool result = strcmp(m_string_val.c_str(), "null") == 0; return result; } - void set_parameter_name(const std::string& name) + SG_FORCED_INLINE void set_parameter_name(const std::string& name) { m_parameter = name; } - void set_string_value(const std::string& value) + SG_FORCED_INLINE void set_string_value(const std::string& value) { m_string_val = value; } @@ -396,17 +681,16 @@ class StringToShogun : public AnyVisitor std::shared_ptr instantiate_model_from_factory( const std::string& factory_name, const std::string& algo_name) { - std::shared_ptr obj; if (factory_name == "machine") - obj = std::shared_ptr(machine(algo_name)); - else if (factory_name == "kernel") - obj = std::shared_ptr(kernel(algo_name)); - else if (factory_name == "distance") - obj = std::shared_ptr(distance(algo_name)); - else - SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) + return std::shared_ptr(machine(algo_name)); + if (factory_name == "kernel") + return std::shared_ptr(kernel(algo_name)); + if (factory_name == "distance") + return std::shared_ptr(distance(algo_name)); - return obj; + SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) + + return nullptr; } /** @@ -426,19 +710,21 @@ void cast_and_put( // temporary fix until shared_ptr PR merged auto* tmp_clone = dynamic_cast(casted_obj->clone()); obj->put(parameter_name, tmp_clone); + return; } - else if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) { auto* tmp_clone = dynamic_cast(casted_obj->clone()); obj->put(parameter_name, tmp_clone); + return; } - else if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) { auto* tmp_clone = dynamic_cast(casted_obj->clone()); obj->put(parameter_name, tmp_clone); + return; } - else - SG_SERROR("Could not cast SGObject.\n") + SG_SERROR("Could not 
cast SGObject.\n") } std::shared_ptr ShogunOpenML::flow_to_model( @@ -447,8 +733,8 @@ std::shared_ptr ShogunOpenML::flow_to_model( auto params = flow->get_parameters(); auto components = flow->get_components(); auto class_name = get_class_info(flow->get_class_name()); - auto module_name = std::get<0>(class_name); - auto algo_name = std::get<1>(class_name); + auto module_name = class_name.first; + auto algo_name = class_name.second; auto obj = instantiate_model_from_factory(module_name, algo_name); auto obj_param = obj->get_params(); @@ -486,12 +772,12 @@ ShogunOpenML::model_to_flow(const std::shared_ptr& model) return std::shared_ptr(); } -std::tuple +std::pair ShogunOpenML::get_class_info(const std::string& class_name) { std::vector class_components; auto begin = class_name.begin(); - std::tuple result; + std::pair result; for (auto it = class_name.begin(); it != class_name.end(); ++it) { @@ -503,15 +789,16 @@ ShogunOpenML::get_class_info(const std::string& class_name) if (std::next(it) == class_name.end()) class_components.emplace_back(std::string(begin, std::next(it))); } - if (class_components[0] == "shogun") - result = std::make_tuple(class_components[1], class_components[2]); + + if (class_components[0] == "shogun" && class_components.size() == 3) + result = std::make_pair(class_components[1], class_components[2]); + else if (class_components[0] == "shogun" && class_components.size() != 3) + SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) else SG_SERROR( "The provided flow is not meant for shogun deserialisation! 
The " "required library is \"%s\".\n", class_components[0].c_str()) - if (class_components.size() != 3) - SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) return result; } diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 8c00ffedb49..8c3821332ab 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -21,6 +21,7 @@ #include #include #include +#include #include namespace shogun @@ -50,6 +51,7 @@ namespace shogun std::string get(const std::string& request, const std::string& format, Args... args) { + m_curl_response_buffer.clear(); auto find_format = m_format_options.find(format); if (find_format == m_format_options.end()) { @@ -137,11 +139,15 @@ namespace shogun static const char* dataset_description; static const char* list_data_qualities; static const char* data_features; + static const char* data_qualities; static const char* list_dataset_qualities; static const char* list_dataset_filter; /* FLOW API */ static const char* flow_file; + + /* TASK API */ + static const char* task_file; }; /** @@ -196,12 +202,15 @@ namespace shogun } /** - * Instantiates a OpenMLFlow by downloaded a flow from the OpenML server. + * Instantiates a OpenMLFlow by downloaded a flow from the OpenML + * server. * * @param flow_id the flow ID - * @param api_key the user API key (might not be required and can be an empty string) + * @param api_key the user API key (might not be required and can be an + * empty string) * @return the OpenMLFlow corresponding to the flow requested - * @throws ShogunException when there is a server error or the requested flow is ill formed. + * @throws ShogunException when there is a server error or the requested + * flow is ill formed. */ static std::shared_ptr download_flow(const std::string& flow_id, const std::string& api_key); @@ -269,6 +278,116 @@ namespace shogun components_type m_components; }; + /** + * Handles an OpenML dataset. 
+ */ + class OpenMLData + { + public: + OpenMLData( + const std::string& name, const std::string& description, + const std::string& data_format, const std::string& dataset_id, + const std::string& version, const std::string& creator, + const std::string& contributor, const std::string& collection_date, + const std::string& upload_date, const std::string& language, + const std::string& license, const std::string& url, + const std::string& default_target_attribute, + const std::string& row_id_attribute, + const std::string& ignore_attribute, + const std::string& version_label, const std::string& citation, + std::vector tag, const std::string& visibility, + const std::string& original_data_url, const std::string& paper_url, + const std::string& update_comment, const std::string& md5_checksum, + std::vector> + param_descriptors, + std::vector> + param_qualities) + + : m_name(name), m_description(description), + m_data_format(data_format), m_dataset_id(dataset_id), + m_version(version), m_creator(creator), + m_contributor(contributor), m_collection_date(collection_date), + m_upload_date(upload_date), m_language(language), + m_license(license), m_url(url), + m_default_target_attribute(default_target_attribute), + m_row_id_attribute(row_id_attribute), + m_ignore_attribute(ignore_attribute), + m_version_label(version_label), m_citation(citation), + m_tag(std::move(tag)), m_visibility(visibility), + m_original_data_url(original_data_url), m_paper_url(paper_url), + m_update_comment(update_comment), m_md5_checksum(md5_checksum), + m_param_descriptors(std::move(param_descriptors)), + m_param_qualities(std::move(param_qualities)) + { + } + + /** + * Creates a dataset instance from a given ID. 
+ * + */ + static std::shared_ptr + get_data(const std::string& id, const std::string& api_key); + + /** + * Returns the dataset + * @param api_key + * @return + */ + std::string get_data_buffer(const std::string& api_key); + + private: + std::string m_name; + std::string m_description; + std::string m_data_format; + std::string m_dataset_id; + std::string m_version; + std::string m_creator; + std::string m_contributor; + std::string m_collection_date; + std::string m_upload_date; + std::string m_language; + std::string m_license; + std::string m_url; + std::string m_default_target_attribute; + std::string m_row_id_attribute; + std::string m_ignore_attribute; + std::string m_version_label; + std::string m_citation; + std::vector m_tag; + std::string m_visibility; + std::string m_original_data_url; + std::string m_paper_url; + std::string m_update_comment; + std::string m_md5_checksum; + std::vector> + m_param_descriptors; + std::vector> + m_param_qualities; + }; + + /** + * Handles an OpenML split. + */ + class OpenMLSplit + { + public: + OpenMLSplit( + const std::string& split_id, const std::string& split_type, + const std::string& split_url, + const std::unordered_map& + split_parameters) + : m_split_id(split_id), m_split_type(split_type), + m_split_url(split_url), m_parameters(split_parameters) + { + } + + private: + std::string m_split_id; + std::string m_split_type; + std::string m_split_url; + std::unordered_map m_parameters; + }; + /** * Handles OpenML tasks. A task contains all the information * required to train and test a model. 
@@ -276,7 +395,46 @@ namespace shogun class OpenMLTask { public: - OpenMLTask(); + enum TaskType + { + SUPERVISED_CLASSIFICATION = 0, + SUPERVISED_REGRESSION = 1, + LEARNING_CURVE = 2, + SUPERVISED_DATASTREAM_CLASSIFICATION = 3, + CLUSTERING = 4, + MACHINE_LEARNING_CHALLENGE = 5, + SURVIVAL_ANALYSIS = 6, + SUBGROUP_DISCOVERY = 7 + }; + OpenMLTask( + const std::string& task_id, const std::string task_name, + TaskType task_type, const std::string& task_type_id, + const std::pair< + std::shared_ptr, std::shared_ptr>& + task_descriptor) + : m_task_id(task_id), m_task_name(task_name), + m_task_type(task_type), m_task_type_id(task_type_id), + m_task_descriptor(task_descriptor) + { + } + + static std::shared_ptr + get_task(const std::string& task_id, const std::string& api_key); + + std::shared_ptr get_dataset() + { + return m_task_descriptor.first; + } + + private: + static TaskType get_task_from_string(const std::string& task_type); + + std::string m_task_id; + std::string m_task_name; + TaskType m_task_type; + std::string m_task_type_id; + std::pair, std::shared_ptr> + m_task_descriptor; }; /** @@ -317,7 +475,7 @@ namespace shogun * @return a tuple with the module name (factory string) and the * algorithm name */ - static std::tuple + static std::pair get_class_info(const std::string& class_name); }; } // namespace shogun From 4c988bbb3ad559b8a1b5fbc88f19be5dc6497c3b Mon Sep 17 00:00:00 2001 From: gf712 Date: Tue, 14 May 2019 13:54:48 +0100 Subject: [PATCH 06/32] started work on splits --- src/shogun/io/OpenMLFlow.cpp | 241 +++++++++++++++++++++++++---------- src/shogun/io/OpenMLFlow.h | 156 +++++++++++++++++++---- 2 files changed, 309 insertions(+), 88 deletions(-) diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index b8572e05a31..bb12fc7199e 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -41,6 +41,8 @@ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) /* OpenML server format */ const 
char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLReader::splits_server = "https://www.openml.org/api_splits"; + /* DATA API */ const char* OpenMLReader::dataset_description = "/data/{}"; const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; @@ -52,10 +54,13 @@ const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; const char* OpenMLReader::flow_file = "/flow/{}"; /* TASK API */ const char* OpenMLReader::task_file = "/task/{}"; +/* SPLIT API */ +const char* OpenMLReader::get_split = "/split/{}"; const std::unordered_map OpenMLReader::m_format_options = {{"xml", xml_server}, - {"json", json_server}}; + {"json", json_server}, + {"split", splits_server}}; const std::unordered_map OpenMLReader::m_request_options = { {"dataset_description", dataset_description}, @@ -104,8 +109,6 @@ void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) } } -#endif // HAVE_CURL - /** * Checks the returned response from OpenML in JSON format * @param doc the parsed OpenML JSON format response @@ -367,19 +370,25 @@ OpenMLData::get_data(const std::string& id, const std::string& api_key) "md5_checksum", dataset_description.GetObject()); // features - std::vector> param_vector; + std::vector>> + param_vector; return_string = reader.get("data_features", "json", id); document.Parse(return_string.c_str()); check_response(document, "data_features"); const Value& dataset_features = document["data_features"]; - for (const auto& param : dataset_features.GetArray()) + for (const auto& param : dataset_features["feature"].GetArray()) { - std::unordered_map param_map; + std::unordered_map> param_map; for (const auto& param_descriptors : param.GetObject()) { - param_map.emplace( - param_descriptors.name.GetString(), - param_descriptors.value.GetString()); + std::vector second; + if (param_descriptors.value.IsArray()) + for (const auto& v : 
param_descriptors.value.GetArray()) + second.emplace_back(v.GetString()); + else + second.emplace_back(param_descriptors.value.GetString()); + + param_map.emplace(param_descriptors.name.GetString(), second); } param_vector.push_back(param_map); } @@ -390,14 +399,17 @@ OpenMLData::get_data(const std::string& id, const std::string& api_key) document.Parse(return_string.c_str()); check_response(document, "data_qualities"); const Value& data_qualities = document["data_qualities"]; - for (const auto& param : data_qualities.GetArray()) + for (const auto& param : data_qualities["quality"].GetArray()) { std::unordered_map param_map; for (const auto& param_quality : param.GetObject()) { - param_map.emplace( - param_quality.name.GetString(), - param_quality.value.GetString()); + if (param_quality.name.IsString() && param_quality.value.IsString()) + param_map.emplace( + param_quality.name.GetString(), + param_quality.value.GetString()); + else if (param_quality.name.IsString()) + param_map.emplace(param_quality.name.GetString(), ""); } qualities_vector.push_back(param_map); } @@ -418,16 +430,28 @@ std::string OpenMLData::get_data_buffer(const std::string& api_key) return nullptr; } +std::shared_ptr +OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) +{ + Document document; + + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("get_split", "split", split_url); + auto return_stream = std::istringstream(return_string); + // add ARFF parsing here + SG_SNOTIMPLEMENTED + return nullptr; +} + std::shared_ptr OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) { Document document; std::string task_name; std::string task_type_id; - std::shared_ptr openml_dataset; - std::shared_ptr openml_split; - std::pair, std::shared_ptr> - task_descriptor; + std::shared_ptr openml_dataset = nullptr; + std::shared_ptr openml_split = nullptr; + std::unordered_map evaluation_measures; auto reader = OpenMLReader(api_key); auto 
return_string = reader.get("task_file", "json", task_id); @@ -451,63 +475,62 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) // expect two elements in input array: dataset and split const Value& json_input = root["input"]; - REQUIRE( - json_input.IsArray(), "Currently the dataset reader can only handle " - "inputs with a dataset and split field.\n") - auto input_array = json_input.GetArray(); - REQUIRE( - input_array.Size() == 2, - "Currently the dataset reader can only handle inputs with a dataset " - "and split fields. Found %d elements.\n", - input_array.Size()) - - // handle dataset - auto json_dataset = input_array[0].GetObject(); - if (strcmp(json_dataset["name"].GetString(), "source_data") == 0) + for (const auto& task_settings : input_array) { - auto dataset_info = json_dataset["data_set"].GetObject(); - std::string dataset_id = dataset_info["data_set_id"].GetString(); - std::string target_feature = dataset_info["target_feature"].GetString(); - // openml_dataset = - // std::make_shared(dataset_id, target_feature); - } - else - SG_SERROR("Error parsing the OpenML dataset, could not find the " - "source_data field.\n") - - // handle split - auto json_split = input_array[1].GetObject(); - if (strcmp(json_split["name"].GetString(), "estimation_procedure") == 0) - { - auto split_info = json_dataset["estimation_procedure"].GetObject(); - std::string split_id = split_info["id"].GetString(); - std::string split_type = split_info["type"].GetString(); - std::string split_url = split_info["data_splits_url"].GetString(); - std::unordered_map split_parameters; - for (const auto& param : split_info["parameter"].GetArray()) + if (strcmp(task_settings["name"].GetString(), "source_data") == 0) { - if (param.Size() == 2) - split_parameters.emplace( - param["name"].GetString(), param["value"].GetString()); - else if (param.Size() == 1) - split_parameters.emplace(param["name"].GetString(), ""); - else - SG_SERROR("Unexpected number of parameters 
in parameter array " - "of estimation_procedure.\n") + auto dataset_info = task_settings["data_set"].GetObject(); + std::string dataset_id = dataset_info["data_set_id"].GetString(); + std::string target_feature = + dataset_info["target_feature"].GetString(); + openml_dataset = OpenMLData::get_data(dataset_id, api_key); + } + else if ( + strcmp(task_settings["name"].GetString(), "estimation_procedure") == + 0) + { + auto split_info = task_settings["estimation_procedure"].GetObject(); + std::string split_id = split_info["id"].GetString(); + std::string split_type = split_info["type"].GetString(); + std::string split_url = split_info["data_splits_url"].GetString(); + std::unordered_map split_parameters; + for (const auto& param : split_info["parameter"].GetArray()) + { + if (param.HasMember("name") && param.HasMember("value")) + split_parameters.emplace( + param["name"].GetString(), param["value"].GetString()); + else if (param.HasMember("name")) + split_parameters.emplace(param["name"].GetString(), ""); + else + SG_SERROR( + "Unexpected number of parameters in parameter array " + "of estimation_procedure.\n") + } + openml_split = std::make_shared( + split_id, split_type, split_url, split_parameters); + } + else if ( + strcmp(task_settings["name"].GetString(), "evaluation_measures") == + 0) + { + auto evaluation_info = + task_settings["evaluation_measures"].GetObject(); + for (const auto& param : evaluation_info) + { + evaluation_measures.emplace( + param.name.GetString(), param.value.GetString()); + } } - openml_split = std::make_shared( - split_id, split_type, split_url, split_parameters); } - else - SG_SERROR("Error parsing the OpenML dataset, could not find the " - "estimation_procedure field.\n") - task_descriptor = std::make_pair(openml_dataset, openml_split); + if (openml_dataset == nullptr && openml_split == nullptr) + SG_SERROR("Error parsing task.") auto result = std::make_shared( - task_id, task_name, task_type, task_type_id, task_descriptor); + task_id, 
task_name, task_type, task_type_id, evaluation_measures, + openml_split, openml_dataset); return result; } @@ -517,7 +540,19 @@ OpenMLTask::get_task_from_string(const std::string& task_type) { if (task_type == "Supervised Classification") return OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION; - SG_SERROR("OpenMLTask does not supported \"%s\"", task_type.c_str()) + SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) +} + +SGMatrix OpenMLTask::get_train_indices() +{ + SG_SNOTIMPLEMENTED + return SGMatrix(); +} + +SGMatrix OpenMLTask::get_test_indices() +{ + SG_SNOTIMPLEMENTED + return SGMatrix(); } /** @@ -802,3 +837,77 @@ ShogunOpenML::get_class_info(const std::string& class_name) return result; } + +CLabels* ShogunOpenML::run_model_on_fold( + const std::shared_ptr& model, + const std::shared_ptr& task, CFeatures* X_train, + index_t repeat_number, index_t fold_number, CLabels* y_train, + CFeatures* X_test) +{ + auto task_type = task->get_task_type(); + auto model_clone = std::shared_ptr(model->clone()); + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + { + if (auto machine = std::dynamic_pointer_cast(model_clone)) + { + machine->put("labels", y_train); + machine->train(X_train); + return machine->apply(X_test); + } + else + SG_SERROR("The provided model is not trainable!\n") + } + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + return nullptr; +} + +std::shared_ptr OpenMLRun::run_model_on_task( + std::shared_ptr model, std::shared_ptr task) +{ + SG_SNOTIMPLEMENTED + return 
std::shared_ptr(); +} + +std::shared_ptr OpenMLRun::run_flow_on_task( + std::shared_ptr flow, std::shared_ptr task) +{ + auto data = task->get_dataset(); + SG_SNOTIMPLEMENTED + return std::shared_ptr(); +} + +std::shared_ptr +OpenMLRun::from_filesystem(const std::string& directory) +{ + SG_SNOTIMPLEMENTED + return nullptr; +} + +void OpenMLRun::to_filesystem(const std::string& directory) const +{ + SG_SNOTIMPLEMENTED +} + +void OpenMLRun::publish() const +{ + SG_SNOTIMPLEMENTED +} + +#endif // HAVE_CURL diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 8c3821332ab..2232d6562fd 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -16,7 +16,6 @@ #include -#include #include #include #include @@ -51,6 +50,7 @@ namespace shogun std::string get(const std::string& request, const std::string& format, Args... args) { + std::string request_path; m_curl_response_buffer.clear(); auto find_format = m_format_options.find(format); if (find_format == m_format_options.end()) @@ -59,15 +59,27 @@ namespace shogun "The provided format \"%s\" is not available\n", format.c_str()) } - auto find_request = m_request_options.find(request); - if (find_request == m_request_options.end()) + + if (format == "split") { - SG_SERROR( - "Could not find a way to solve the request \"%s\"\n", - request.c_str()) + REQUIRE( + request == "get_split", + "Split server can only handle \"get_split\" request.\n") + request_path = get_split; } + else + { + auto find_request = m_request_options.find(request); + if (find_request == m_request_options.end()) + { + SG_SERROR( + "Could not find a way to solve the request \"%s\"\n", + request.c_str()) + } + request_path = find_request->second; + } + std::string request_format = find_format->second; - std::string request_path = find_request->second; // get additional args and concatenate them with "/" if (sizeof...(Args) > 0) @@ -127,6 +139,8 @@ namespace shogun static const char* xml_server; /** the server path to get a 
response in JSON format*/ static const char* json_server; + /** the server path to get a split in ARFF format */ + static const char* splits_server; /** the server response format options: XML or JSON */ static const std::unordered_map @@ -148,6 +162,9 @@ namespace shogun /* TASK API */ static const char* task_file; + + /* SPLIT API */ + static const char* get_split; }; /** @@ -298,7 +315,8 @@ namespace shogun std::vector tag, const std::string& visibility, const std::string& original_data_url, const std::string& paper_url, const std::string& update_comment, const std::string& md5_checksum, - std::vector> + std::vector< + std::unordered_map>> param_descriptors, std::vector> param_qualities) @@ -359,7 +377,7 @@ namespace shogun std::string m_paper_url; std::string m_update_comment; std::string m_md5_checksum; - std::vector> + std::vector>> m_param_descriptors; std::vector> m_param_qualities; @@ -381,6 +399,9 @@ namespace shogun { } + static std::shared_ptr + get_split(const std::string& split_url, const std::string& api_key); + private: std::string m_split_id; std::string m_split_type; @@ -395,7 +416,7 @@ namespace shogun class OpenMLTask { public: - enum TaskType + enum class TaskType { SUPERVISED_CLASSIFICATION = 0, SUPERVISED_REGRESSION = 1, @@ -406,26 +427,49 @@ namespace shogun SURVIVAL_ANALYSIS = 6, SUBGROUP_DISCOVERY = 7 }; + + enum class TaskEvaluation + { + + }; + OpenMLTask( const std::string& task_id, const std::string task_name, TaskType task_type, const std::string& task_type_id, - const std::pair< - std::shared_ptr, std::shared_ptr>& - task_descriptor) + std::unordered_map evaluation_measures, + std::shared_ptr split, + std::shared_ptr data) : m_task_id(task_id), m_task_name(task_name), m_task_type(task_type), m_task_type_id(task_type_id), - m_task_descriptor(task_descriptor) + m_evaluation_measures(evaluation_measures), m_split(split), + m_data(data) { } static std::shared_ptr get_task(const std::string& task_id, const std::string& api_key); - 
std::shared_ptr get_dataset() + std::shared_ptr get_dataset() const noexcept + { + return m_data; + } + + std::shared_ptr get_split() const noexcept { - return m_task_descriptor.first; + return m_split; } + SGMatrix get_train_indices(); + + SGMatrix get_test_indices(); + +#ifndef SWIG + SG_FORCED_INLINE TaskType get_task_type() const noexcept + { + return m_task_type; + } +#endif // SWIG + private: static TaskType get_task_from_string(const std::string& task_type); @@ -433,8 +477,9 @@ namespace shogun std::string m_task_name; TaskType m_task_type; std::string m_task_type_id; - std::pair, std::shared_ptr> - m_task_descriptor; + std::unordered_map m_evaluation_measures; + std::shared_ptr m_split; + std::shared_ptr m_data; }; /** @@ -464,12 +509,19 @@ namespace shogun static std::shared_ptr model_to_flow(const std::shared_ptr& model); + protected: + CLabels* run_model_on_fold( + const std::shared_ptr& model, + const std::shared_ptr& task, CFeatures* X_train, + index_t repeat_number, index_t fold_number, CLabels* y_train, + CFeatures* X_test); + private: /** - * Helper function to extract module/factory information from the class - * name field of OpenMLFlow. Throws an error either if the class name - * field is ill formed (i.e. not library.module.algorithm) or if the - * library name is not "shogun". + * Helper function to extract module/factory information from the + * class name field of OpenMLFlow. Throws an error either if the + * class name field is ill formed (i.e. not + * library.module.algorithm) or if the library name is not "shogun". 
* * @param class_name the flow class_name field * @return a tuple with the module name (factory string) and the @@ -478,6 +530,66 @@ namespace shogun static std::pair get_class_info(const std::string& class_name); }; + + class OpenMLRun + { + public: + OpenMLRun( + const std::string& uploader, const std::string& uploader_name, + const std::string& setup_id, const std::string& setup_string, + const std::string& parameter_settings, + std::vector evaluations, + std::vector fold_evaluations, + std::vector sample_evaluations, + const std::string& data_content, + std::vector output_files, + std::shared_ptr task, std::shared_ptr flow, + const std::string& run_id, std::shared_ptr model, + std::vector tags, std::string predictions_url) + : m_uploader(uploader), m_uploader_name(uploader_name), + m_setup_id(setup_id), m_setup_string(setup_string), + m_parameter_settings(parameter_settings), + m_evaluations(std::move(evaluations)), + m_fold_evaluations(std::move(fold_evaluations)), + m_sample_evaluations(std::move(sample_evaluations)), + m_data_content(data_content), + m_output_files(std::move(output_files)), m_task(task), + m_flow(flow), m_run_id(run_id), m_model(model), m_tags(tags), + m_predictions_url(predictions_url) + { + } + + static std::shared_ptr + from_filesystem(const std::string& directory); + + static std::shared_ptr run_flow_on_task( + std::shared_ptr flow, std::shared_ptr task); + + static std::shared_ptr run_model_on_task( + std::shared_ptr model, std::shared_ptr task); + + void to_filesystem(const std::string& directory) const; + + void publish() const; + + private: + std::string m_uploader; + std::string m_uploader_name; + std::string m_setup_id; + std::string m_setup_string; + std::string m_parameter_settings; + std::vector m_evaluations; + std::vector m_fold_evaluations; + std::vector m_sample_evaluations; + std::string m_data_content; + std::vector m_output_files; + std::shared_ptr m_task; + std::shared_ptr m_flow; + std::string m_run_id; + 
std::shared_ptr m_model; + std::vector m_tags; + std::string m_predictions_url; + }; } // namespace shogun #endif // HAVE_CURL From 859786b6d7464c99ad0b686305c4a3331c32a628 Mon Sep 17 00:00:00 2001 From: gf712 Date: Tue, 14 May 2019 17:19:03 +0100 Subject: [PATCH 07/32] removed curl from header file --- src/shogun/io/OpenMLFlow.cpp | 50 ++++++++++++------------------- src/shogun/io/OpenMLFlow.h | 57 ++++++++++++++++-------------------- 2 files changed, 44 insertions(+), 63 deletions(-) diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index bb12fc7199e..590711823df 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -5,16 +5,16 @@ */ #include -#include #include #include +#ifdef HAVE_CURL +#include +#endif // HAVE_CURL using namespace shogun; using namespace rapidjson; -#ifdef HAVE_CURL - /** * The writer callback function used to write the packets to a C++ string. * @param data the data received in CURL request @@ -25,16 +25,14 @@ using namespace rapidjson; */ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) { - // adapted from https://stackoverflow.com/a/5780603 - // Is there anything in the buffer? 
- if (buffer_in->empty()) + // check that the buffer string points to something + if (buffer_in != nullptr) { // Append the data to the buffer buffer_in->append(data, size * nmemb); return size * nmemb; } - return 0; } @@ -55,7 +53,7 @@ const char* OpenMLReader::flow_file = "/flow/{}"; /* TASK API */ const char* OpenMLReader::task_file = "/task/{}"; /* SPLIT API */ -const char* OpenMLReader::get_split = "/split/{}"; +const char* OpenMLReader::get_split = "/get/{}"; const std::unordered_map OpenMLReader::m_format_options = {{"xml", xml_server}, @@ -72,12 +70,9 @@ const std::unordered_map {"flow_file", flow_file}, {"task_file", task_file}}; -OpenMLReader::OpenMLReader(const std::string& api_key) : m_api_key(api_key) -{ -} - void OpenMLReader::openml_curl_request_helper(const std::string& url) { +#ifdef HAVE_CURL CURL* curl_handle = nullptr; curl_handle = curl_easy_init(); @@ -95,18 +90,11 @@ void OpenMLReader::openml_curl_request_helper(const std::string& url) CURLcode res = curl_easy_perform(curl_handle); - openml_curl_error_helper(curl_handle, res); + if (res != CURLE_OK) + SG_SERROR("Connection error: %s.\n", curl_easy_strerror(res)) curl_easy_cleanup(curl_handle); -} - -void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) -{ - if (code != CURLE_OK) - { - // TODO: call curl_easy_cleanup(curl_handle) ? 
- SG_SERROR("Connection error: %s.\n", curl_easy_strerror(code)) - } +#endif // HAVE_CURL } /** @@ -298,7 +286,7 @@ void OpenMLFlow::upload_flow(const std::shared_ptr& flow) SG_SNOTIMPLEMENTED; } -void OpenMLFlow::dump() +void OpenMLFlow::dump() const { SG_SNOTIMPLEMENTED; } @@ -543,13 +531,13 @@ OpenMLTask::get_task_from_string(const std::string& task_type) SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) } -SGMatrix OpenMLTask::get_train_indices() +SGMatrix OpenMLTask::get_train_indices() const { SG_SNOTIMPLEMENTED return SGMatrix(); } -SGMatrix OpenMLTask::get_test_indices() +SGMatrix OpenMLTask::get_test_indices() const { SG_SNOTIMPLEMENTED return SGMatrix(); @@ -685,18 +673,18 @@ class StringToShogun : public AnyVisitor * In OpenML "null" is an empty parameter value field. * @return whether the field is "null" */ - SG_FORCED_INLINE bool is_null() + SG_FORCED_INLINE bool is_null() const noexcept { bool result = strcmp(m_string_val.c_str(), "null") == 0; return result; } - SG_FORCED_INLINE void set_parameter_name(const std::string& name) + SG_FORCED_INLINE void set_parameter_name(const std::string& name) noexcept { m_parameter = name; } - SG_FORCED_INLINE void set_string_value(const std::string& value) + SG_FORCED_INLINE void set_string_value(const std::string& value) noexcept { m_string_val = value; } @@ -774,7 +762,7 @@ std::shared_ptr ShogunOpenML::flow_to_model( auto obj = instantiate_model_from_factory(module_name, algo_name); auto obj_param = obj->get_params(); - std::unique_ptr visitor(new StringToShogun(obj)); + auto visitor = std::make_unique(obj); if (initialize_with_defaults) { @@ -859,7 +847,7 @@ CLabels* ShogunOpenML::run_model_on_fold( return machine->apply(X_test); } else - SG_SERROR("The provided model is not trainable!\n") + SG_SERROR("The provided model is not a trainable machine!\n") } break; case OpenMLTask::TaskType::LEARNING_CURVE: @@ -909,5 +897,3 @@ void OpenMLRun::publish() const { SG_SNOTIMPLEMENTED } - -#endif // 
HAVE_CURL diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 2232d6562fd..45086d44e94 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -9,13 +9,9 @@ #include -#ifdef HAVE_CURL - #include #include -#include - #include #include #include @@ -32,7 +28,9 @@ namespace shogun { public: - explicit OpenMLReader(const std::string& api_key); + explicit OpenMLReader(const std::string& api_key) : m_api_key(api_key) + { + } /** * Returns a string returned by the server given a request. @@ -50,7 +48,9 @@ namespace shogun std::string get(const std::string& request, const std::string& format, Args... args) { +#ifdef HAVE_CURL std::string request_path; + // clear the buffer before request m_curl_response_buffer.clear(); auto find_format = m_format_options.find(format); if (find_format == m_format_options.end()) @@ -63,8 +63,8 @@ namespace shogun if (format == "split") { REQUIRE( - request == "get_split", - "Split server can only handle \"get_split\" request.\n") + request == "get_split", + "Split server can only handle \"get_split\" request.\n") request_path = get_split; } else @@ -73,8 +73,8 @@ namespace shogun if (find_request == m_request_options.end()) { SG_SERROR( - "Could not find a way to solve the request \"%s\"\n", - request.c_str()) + "Could not find a way to solve the request \"%s\"\n", + request.c_str()) } request_path = find_request->second; } @@ -110,6 +110,9 @@ namespace shogun openml_curl_request_helper(url); return m_curl_response_buffer; +#else + SG_SERROR("This function is only available witht the CURL library!\n") +#endif // HAVE_CURL } private: @@ -124,14 +127,6 @@ namespace shogun */ void openml_curl_request_helper(const std::string& url); - /** - * Handles all possible codes - * - * @param curl_handle curl handle used in the request - * @param code the code returned by the query - */ - void openml_curl_error_helper(CURL* curl_handle, CURLcode code); - /** the user API key, not required for all requests */ 
std::string m_api_key; @@ -214,7 +209,8 @@ namespace shogun const std::string& model, components_type components, parameters_type parameters) : m_name(name), m_description(description), m_class_name(model), - m_parameters(parameters), m_components(components) + m_parameters(std::move(parameters)), + m_components(std::move(components)) { } @@ -247,7 +243,7 @@ namespace shogun /** * Dumps the OpenMLFlow to disk. */ - void dump(); + void dump() const; /** * Gets a subflow, i.e. a kernel in a machine @@ -266,17 +262,17 @@ namespace shogun } #ifndef SWIG - SG_FORCED_INLINE parameters_type get_parameters() + SG_FORCED_INLINE parameters_type get_parameters() const noexcept { return m_parameters; } - SG_FORCED_INLINE components_type get_components() + SG_FORCED_INLINE components_type get_components() const noexcept { return m_components; } - SG_FORCED_INLINE std::string get_class_name() + SG_FORCED_INLINE std::string get_class_name() const noexcept { return m_class_name; } @@ -320,7 +316,6 @@ namespace shogun param_descriptors, std::vector> param_qualities) - : m_name(name), m_description(description), m_data_format(data_format), m_dataset_id(dataset_id), m_version(version), m_creator(creator), @@ -441,8 +436,8 @@ namespace shogun std::shared_ptr data) : m_task_id(task_id), m_task_name(task_name), m_task_type(task_type), m_task_type_id(task_type_id), - m_evaluation_measures(evaluation_measures), m_split(split), - m_data(data) + m_evaluation_measures(std::move(evaluation_measures)), + m_split(std::move(split)), m_data(std::move(data)) { } @@ -459,9 +454,9 @@ namespace shogun return m_split; } - SGMatrix get_train_indices(); + SGMatrix get_train_indices() const; - SGMatrix get_test_indices(); + SGMatrix get_test_indices() const; #ifndef SWIG SG_FORCED_INLINE TaskType get_task_type() const noexcept @@ -553,9 +548,10 @@ namespace shogun m_fold_evaluations(std::move(fold_evaluations)), m_sample_evaluations(std::move(sample_evaluations)), m_data_content(data_content), - 
m_output_files(std::move(output_files)), m_task(task), - m_flow(flow), m_run_id(run_id), m_model(model), m_tags(tags), - m_predictions_url(predictions_url) + m_output_files(std::move(output_files)), m_task(std::move(task)), + m_flow(std::move(flow)), m_run_id(run_id), + m_model(std::move(model)), m_tags(std::move(tags)), + m_predictions_url(std::move(predictions_url)) { } @@ -591,6 +587,5 @@ namespace shogun std::string m_predictions_url; }; } // namespace shogun -#endif // HAVE_CURL #endif // SHOGUN_OPENMLFLOW_H From beb0acfcd63b12145b3bec3aa0deaad5411d3aa9 Mon Sep 17 00:00:00 2001 From: gf712 Date: Wed, 15 May 2019 16:21:42 +0100 Subject: [PATCH 08/32] added data getters --- src/shogun/io/OpenMLFlow.cpp | 205 ++++++++++++++++++++++++++++++++--- src/shogun/io/OpenMLFlow.h | 119 ++++++++++++++++---- 2 files changed, 292 insertions(+), 32 deletions(-) diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index 590711823df..d2dcc4a56cd 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -6,10 +6,13 @@ #include #include +#include #include #ifdef HAVE_CURL +#include "OpenMLFlow.h" #include + #endif // HAVE_CURL using namespace shogun; @@ -39,6 +42,7 @@ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) /* OpenML server format */ const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLReader::download_server = ""; const char* OpenMLReader::splits_server = "https://www.openml.org/api_splits"; /* DATA API */ @@ -58,7 +62,8 @@ const char* OpenMLReader::get_split = "/get/{}"; const std::unordered_map OpenMLReader::m_format_options = {{"xml", xml_server}, {"json", json_server}, - {"split", splits_server}}; + {"split", splits_server}, + {"download", download_server}}; const std::unordered_map OpenMLReader::m_request_options = { {"dataset_description", dataset_description}, @@ -298,7 +303,7 
@@ std::shared_ptr OpenMLFlow::from_file() } std::shared_ptr -OpenMLData::get_data(const std::string& id, const std::string& api_key) +OpenMLData::get_dataset(const std::string& id, const std::string& api_key) { // description Document document; @@ -408,27 +413,189 @@ OpenMLData::get_data(const std::string& id, const std::string& api_key) default_target_attribute, row_id_attribute, ignore_attribute, version_label, citation, tags, visibility, original_data_url, paper_url, update_comment, md5_checksum, param_vector, qualities_vector); + result->set_api_key(api_key); + return result; +} +std::shared_ptr OpenMLData::get_features() noexcept +{ + if (!m_cached_features) + get_data(); + return m_cached_features; +} + +std::shared_ptr OpenMLData::get_features(const std::string& label) +{ + auto find_label = + std::find(m_feature_names.begin(), m_feature_names.end(), label); + if (find_label == m_feature_names.end()) + SG_SERROR( + "Requested label \"%s\" not in the dataset!\n", label.c_str()) + if (!m_cached_features) + get_data(); + auto col_idx = std::distance(m_feature_names.begin(), find_label); + auto result = std::shared_ptr(m_cached_features->clone()->as()); + if (result->delete_feature_obj(col_idx)) + SG_SERROR("Error deleting the label column in CombinedFeatures!\n") return result; } -std::string OpenMLData::get_data_buffer(const std::string& api_key) +std::shared_ptr OpenMLData::get_labels() { - SG_SNOTIMPLEMENTED; + REQUIRE( + !m_default_target_attribute.empty(), + "A default target attribute is required if no label is given!\n") + return get_labels(m_default_target_attribute); +} + +std::shared_ptr OpenMLData::get_labels(const std::string& label_name) +{ + auto find_label = + std::find(m_feature_names.begin(), m_feature_names.end(), label_name); + if (find_label == m_feature_names.end()) + SG_SERROR( + "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) + auto col_idx = std::distance(m_feature_names.begin(), find_label); + + if 
(!m_cached_features) + get_data(); + + auto target_label_as_feat = + std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); + + // TODO: replace with actual enum values + switch(m_feature_types[col_idx]) + { + // real features + case 0: + { + auto casted_feat = std::dynamic_pointer_cast>(target_label_as_feat); + auto labels_vec = casted_feat->get_feature_vector(0); + auto labels = std::make_shared(); + labels->set_values(labels_vec); + return labels; + } break; + // nominal features + case 1: + { + auto casted_feat = std::dynamic_pointer_cast>(target_label_as_feat); + auto labels_vec = casted_feat->get_feature_vector(0); + auto labels = std::make_shared(); + labels->set_values(labels_vec); + return labels; + } break; + default: + SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) + } + return nullptr; } +void OpenMLData::get_data() +{ + auto reader = OpenMLReader(m_api_key); + auto return_string = reader.get(m_url); + + // TODO: add ARFF parsing and don't forget feature names and feature types + m_cached_features = std::make_shared(); +} + std::shared_ptr OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) { - Document document; - auto reader = OpenMLReader(api_key); auto return_string = reader.get("get_split", "split", split_url); + + if (return_string == "Task not providing datasplits.") + return std::make_shared(); + auto return_stream = std::istringstream(return_string); - // add ARFF parsing here - SG_SNOTIMPLEMENTED - return nullptr; + // TODO: add ARFF parsing here + // get train/test indices + // TODO: replace line below with ARFFDeserialiser::get_features() + auto arff_features = std::make_shared(); + REQUIRE( + arff_features->get_num_feature_obj() == 4, + "Expected a ARFF file with 4 attributes: type, rowid, repeat and " + "fold.\n") + + auto train_test_feat = + std::shared_ptr(arff_features->get_feature_obj(0)); + auto rowid_feat = + std::shared_ptr(arff_features->get_feature_obj(1)); + auto repeat_feat 
= + std::shared_ptr(arff_features->get_feature_obj(2)); + auto fold_feat = + std::shared_ptr(arff_features->get_feature_obj(3)); + + auto type_vector = string_feature_to_vector(train_test_feat); + auto rowid_vector = dense_feature_to_vector(rowid_feat); + auto repeat_vector = dense_feature_to_vector(repeat_feat); + auto fold_vector = dense_feature_to_vector(fold_feat); + + std::vector> train_idx, test_idx; + for (int i = 0; i < arff_features->get_num_vectors(); ++i) + { + if (type_vector[i] == LabelType::TRAIN) + train_idx.emplace_back(std::initializer_list{ + static_cast(rowid_vector[i]), + static_cast(repeat_vector[i]), + static_cast(fold_vector[i])}); + else + test_idx.emplace_back(std::initializer_list{ + static_cast(rowid_vector[i]), + static_cast(repeat_vector[i]), + static_cast(fold_vector[i])}); + } + + return std::make_shared(train_idx, test_idx); +} + +SGVector +OpenMLSplit::dense_feature_to_vector(const std::shared_ptr& feat) +{ + auto casted_feat = + std::dynamic_pointer_cast>(feat); + // this should never happen + if (!casted_feat) + SG_SERROR("Error casting a column in the split file from CFeatures to " + "CDenseFeatures!\n>"); + return casted_feat->get_feature_vector(0); +} + +std::vector +OpenMLSplit::string_feature_to_vector(const std::shared_ptr& feat) +{ + auto casted_feat = std::dynamic_pointer_cast>(feat); + // this should never happen + if (!casted_feat) + SG_SERROR("Error casting a column in the split file from CFeatures to " + "CStringFeatures!\n"); + + auto to_lower = [](const std::string& line) { + std::string result; + std::transform( + line.begin(), line.end(), std::back_inserter(result), + [](uint8_t val) { return std::tolower(val); }); + return result; + }; + + std::vector result; + + for (int i = 0; i < casted_feat->get_num_vectors(); ++i) + { + auto row = casted_feat->get_feature_vector(i); + std::string label(1, row[0]); + for (auto j = 1; j < casted_feat->get_max_vector_length(); ++j) + label.append(1, row[j]); + if 
(to_lower(label) == "train") + result.push_back(LabelType::TRAIN); + else if (to_lower(label) == "test") + result.push_back(LabelType::TEST); + else + SG_SERROR("Unknown label type in split file %s!\n", label.c_str()) + } + return result; } std::shared_ptr @@ -473,7 +640,7 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) std::string dataset_id = dataset_info["data_set_id"].GetString(); std::string target_feature = dataset_info["target_feature"].GetString(); - openml_dataset = OpenMLData::get_data(dataset_id, api_key); + openml_dataset = OpenMLData::get_dataset(dataset_id, api_key); } else if ( strcmp(task_settings["name"].GetString(), "estimation_procedure") == @@ -496,8 +663,11 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) "Unexpected number of parameters in parameter array " "of estimation_procedure.\n") } - openml_split = std::make_shared( - split_id, split_type, split_url, split_parameters); + REQUIRE( + split_type == "crossvalidation", + "Currently only tasks with cross validation are enabled in " + "shogun!\n") + openml_split = OpenMLSplit::get_split(split_url, api_key); } else if ( strcmp(task_settings["name"].GetString(), "evaluation_measures") == @@ -877,7 +1047,16 @@ std::shared_ptr OpenMLRun::run_flow_on_task( std::shared_ptr flow, std::shared_ptr task) { auto data = task->get_dataset(); - SG_SNOTIMPLEMENTED + std::shared_ptr train_features, test_features; + std::shared_ptr train_labels, test_labels; + + if (task->get_split()->contains_splits()) + SG_SNOTIMPLEMENTED + else + { + auto labels = data->get_labels(); + auto feat = data->get_features(); + } return std::shared_ptr(); } diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 45086d44e94..1ae2691be9a 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -48,10 +49,7 @@ namespace shogun std::string get(const std::string& request, 
const std::string& format, Args... args) { -#ifdef HAVE_CURL std::string request_path; - // clear the buffer before request - m_curl_response_buffer.clear(); auto find_format = m_format_options.find(format); if (find_format == m_format_options.end()) { @@ -107,11 +105,20 @@ namespace shogun std::string url = request_format + request_path + "?" + m_api_key; - openml_curl_request_helper(url); + return get(url); + } + std::string get(const std::string& url) + { +#ifdef HAVE_CURL + // clear the buffer before request + m_curl_response_buffer.clear(); + + openml_curl_request_helper(url); return m_curl_response_buffer; #else - SG_SERROR("This function is only available witht the CURL library!\n") + SG_SERROR( + "This function is only available with the CURL library!\n") #endif // HAVE_CURL } @@ -134,6 +141,8 @@ namespace shogun static const char* xml_server; /** the server path to get a response in JSON format*/ static const char* json_server; + /** the server path to download datasets */ + static const char* download_server; /** the server path to get a split in ARFF format */ static const char* splits_server; @@ -339,16 +348,53 @@ namespace shogun * */ static std::shared_ptr - get_data(const std::string& id, const std::string& api_key); + get_dataset(const std::string& id, const std::string& api_key); /** - * Returns the dataset - * @param api_key + * Returns ALL the features of the dataset, potentially also the labels column + * @return the features + */ + std::shared_ptr get_features() noexcept; + + /** + * Returns the dataset features + * @param label_name the name of the attribute containing the label + * @return the features + */ + std::shared_ptr get_features(const std::string& label_name); + + /** + * Returns the dataset labels if m_default_target_attribute is not empty + * @return the labels + */ + std::shared_ptr get_labels(); + + /** + * Returns the dataset labels given the label_name + * @return the labels + */ + std::shared_ptr get_labels(const std::string& 
label_name); + + /** + * Returns the type of all attributes/features in the ARFF file * @return */ - std::string get_data_buffer(const std::string& api_key); + // TODO: replace with actual enum values + SG_FORCED_INLINE std::vector get_feature_types() const noexcept + { + return m_feature_types; + } + + protected: + SG_FORCED_INLINE void set_api_key(const std::string& api_key) noexcept + { + m_api_key = api_key; + } private: + + void get_data(); + std::string m_name; std::string m_description; std::string m_data_format; @@ -376,6 +422,11 @@ namespace shogun m_param_descriptors; std::vector> m_param_qualities; + std::string m_api_key; + + std::shared_ptr m_cached_features; + std::vector m_feature_names; + std::vector m_feature_types; // TODO: replace int with type enum }; /** @@ -384,24 +435,54 @@ namespace shogun class OpenMLSplit { public: + enum class LabelType + { + TRAIN = 1, + TEST = 2 + }; + + /** + * Default constructor. This is used when there are no + * train or test indices. + */ + OpenMLSplit() = default; + OpenMLSplit( - const std::string& split_id, const std::string& split_type, - const std::string& split_url, - const std::unordered_map& - split_parameters) - : m_split_id(split_id), m_split_type(split_type), - m_split_url(split_url), m_parameters(split_parameters) + std::vector> train_idx, + std::vector> test_idx) + : m_train_idx(std::move(train_idx)), m_test_idx(std::move(test_idx)) { } static std::shared_ptr get_split(const std::string& split_url, const std::string& api_key); + SG_FORCED_INLINE std::vector> get_train_idx() const + noexcept + { + return m_train_idx; + } + + SG_FORCED_INLINE std::vector> get_test_idx() const + noexcept + { + return m_test_idx; + } + + SG_FORCED_INLINE bool contains_splits() const noexcept + { + return !m_train_idx.empty() && !m_test_idx.empty(); + } + private: - std::string m_split_id; - std::string m_split_type; - std::string m_split_url; - std::unordered_map m_parameters; + static SGVector + 
dense_feature_to_vector(const std::shared_ptr& feat); + + static std::vector + string_feature_to_vector(const std::shared_ptr& feat); + + std::vector> m_train_idx; + std::vector> m_test_idx; }; /** From 86c06b82bf08ec194fe0667d4caef22d2b0115f4 Mon Sep 17 00:00:00 2001 From: gf712 Date: Fri, 17 May 2019 10:14:32 +0100 Subject: [PATCH 09/32] get binary classification working [SKIP CI] --- src/shogun/features/CombinedFeatures.cpp | 24 ++-- src/shogun/features/CombinedFeatures.h | 2 +- src/shogun/io/OpenMLFlow.cpp | 156 +++++++++++++++-------- src/shogun/io/OpenMLFlow.h | 33 +++-- 4 files changed, 140 insertions(+), 75 deletions(-) diff --git a/src/shogun/features/CombinedFeatures.cpp b/src/shogun/features/CombinedFeatures.cpp index 6dd4049acb1..9237ea11206 100644 --- a/src/shogun/features/CombinedFeatures.cpp +++ b/src/shogun/features/CombinedFeatures.cpp @@ -34,7 +34,6 @@ CFeatures* CCombinedFeatures::duplicate() const CCombinedFeatures::~CCombinedFeatures() { - SG_UNREF(feature_array); } CFeatures* CCombinedFeatures::get_feature_obj(int32_t idx) const @@ -42,7 +41,7 @@ CFeatures* CCombinedFeatures::get_feature_obj(int32_t idx) const REQUIRE( idx < get_num_feature_obj() && idx>=0, "Feature index (%d) must be within [%d, %d]", idx, 0, get_num_feature_obj()-1); - return (CFeatures*) feature_array->get_element(idx); + return feature_array.at(idx); } void CCombinedFeatures::list_feature_objs() const @@ -129,7 +128,8 @@ bool CCombinedFeatures::insert_feature_obj(CFeatures* obj, int32_t idx) } num_vec=n; - return feature_array->insert_element(obj, idx); + feature_array.insert(feature_array.begin()+idx, obj); + return true; } bool CCombinedFeatures::append_feature_obj(CFeatures* obj) @@ -146,27 +146,25 @@ bool CCombinedFeatures::append_feature_obj(CFeatures* obj) num_vec=n; int num_feature_obj = get_num_feature_obj(); - feature_array->push_back(obj); - return num_feature_obj+1 == feature_array->get_num_elements(); + feature_array.push_back(obj); + return num_feature_obj+1 
== feature_array.size(); } bool CCombinedFeatures::delete_feature_obj(int32_t idx) { - return feature_array->delete_element(idx); + feature_array.erase(feature_array.begin()+idx); + return true; } int32_t CCombinedFeatures::get_num_feature_obj() const { - return feature_array->get_num_elements(); + return feature_array.size(); } void CCombinedFeatures::init() { - feature_array = new CDynamicObjectArray(); - SG_REF(feature_array); - SG_ADD(&num_vec, "num_vec", "Number of vectors."); - SG_ADD(&feature_array, "feature_array", "Feature array."); + watch_param("feature_array", &feature_array); } CFeatures* CCombinedFeatures::create_merged_copy(CFeatures* other) const @@ -299,10 +297,10 @@ void CCombinedFeatures::remove_all_subsets() CFeatures* CCombinedFeatures::copy_subset(SGVector indices) { /* this is returned with the results of copy_subset of sub-features */ - CCombinedFeatures* result=new CCombinedFeatures(); + auto* result=new CCombinedFeatures(); /* map to only copy same feature objects once */ - CMap* processed=new CMap(); + auto* processed=new CMap(); for (index_t f_idx=0; f_idx feature_array; /** number of vectors * must match between sub features diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index d2dcc4a56cd..73894f73516 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -5,14 +5,12 @@ */ #include -#include #include +#include #include #ifdef HAVE_CURL -#include "OpenMLFlow.h" #include - #endif // HAVE_CURL using namespace shogun; @@ -417,31 +415,54 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) return result; } -std::shared_ptr OpenMLData::get_features() noexcept +std::shared_ptr OpenMLData::get_features() noexcept { if (!m_cached_features) get_data(); return m_cached_features; } -std::shared_ptr OpenMLData::get_features(const std::string& label) +std::shared_ptr OpenMLData::get_features(const std::string& label) { + if (!m_cached_features) + get_data(); auto find_label = 
- std::find(m_feature_names.begin(), m_feature_names.end(), label); + std::find(m_feature_names.begin(), m_feature_names.end(), label); if (find_label == m_feature_names.end()) - SG_SERROR( - "Requested label \"%s\" not in the dataset!\n", label.c_str()) + SG_SERROR("Requested label \"%s\" not in the dataset!\n", label.c_str()) if (!m_cached_features) get_data(); auto col_idx = std::distance(m_feature_names.begin(), find_label); - auto result = std::shared_ptr(m_cached_features->clone()->as()); - if (result->delete_feature_obj(col_idx)) - SG_SERROR("Error deleting the label column in CombinedFeatures!\n") + auto feat_type_copy = m_feature_types; + feat_type_copy.erase(feat_type_copy.begin() + col_idx); + for (const auto type : feat_type_copy) + { + if (type == ARFFDeserializer::Attribute::STRING) + SG_SERROR("Currently cannot process string features!\n") + } + std::shared_ptr result; + bool first = true; + for (int i = 0; i < m_feature_types.size(); ++i) + { + if (i != col_idx && first) + { + result.reset(m_cached_features->get_feature_obj(i)); + first = false; + } + if (i != col_idx) + result.reset(result->create_merged_copy( + m_cached_features->get_feature_obj(i))); + } + std::dynamic_pointer_cast>(result)->set_num_features(m_feature_types.size()); + std::dynamic_pointer_cast>(result)->set_num_vectors(m_cached_features->get_num_vectors()); + return result; } std::shared_ptr OpenMLData::get_labels() { + if (!m_cached_features) + get_data(); REQUIRE( !m_default_target_attribute.empty(), "A default target attribute is required if no label is given!\n") @@ -450,6 +471,8 @@ std::shared_ptr OpenMLData::get_labels() std::shared_ptr OpenMLData::get_labels(const std::string& label_name) { + if (!m_cached_features) + get_data(); auto find_label = std::find(m_feature_names.begin(), m_feature_names.end(), label_name); if (find_label == m_feature_names.end()) @@ -463,29 +486,38 @@ std::shared_ptr OpenMLData::get_labels(const std::string& label_name) auto target_label_as_feat 
= std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); - // TODO: replace with actual enum values - switch(m_feature_types[col_idx]) + switch (m_feature_types[col_idx]) { - // real features - case 0: - { - auto casted_feat = std::dynamic_pointer_cast>(target_label_as_feat); - auto labels_vec = casted_feat->get_feature_vector(0); - auto labels = std::make_shared(); - labels->set_values(labels_vec); - return labels; - } break; - // nominal features - case 1: + // real features + case ARFFDeserializer::Attribute::REAL: + case ARFFDeserializer::Attribute::NUMERIC: + case ARFFDeserializer::Attribute::INTEGER: + case ARFFDeserializer::Attribute::DATE: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + auto labels = std::make_shared(labels_vec); + return labels; + } + break; + // nominal features + case ARFFDeserializer::Attribute::NOMINAL: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + for(auto& val: labels_vec) { - auto casted_feat = std::dynamic_pointer_cast>(target_label_as_feat); - auto labels_vec = casted_feat->get_feature_vector(0); - auto labels = std::make_shared(); - labels->set_values(labels_vec); - return labels; - } break; - default: - SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) + if (val == 0) + val = -1; + } + auto labels = std::make_shared(labels_vec); + return labels; + } + break; + default: + SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) } return nullptr; @@ -494,10 +526,14 @@ std::shared_ptr OpenMLData::get_labels(const std::string& label_name) void OpenMLData::get_data() { auto reader = OpenMLReader(m_api_key); - auto return_string = reader.get(m_url); - - // TODO: add ARFF parsing and don't forget feature names and feature types - m_cached_features = std::make_shared(); + std::shared_ptr ss = + 
std::make_shared(reader.get(m_url)); + + auto parser = ARFFDeserializer(ss); + parser.read(); + m_cached_features = parser.get_features(); + m_feature_names = parser.get_feature_names(); + m_feature_types = parser.get_attribute_types(); } std::shared_ptr @@ -677,14 +713,17 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) task_settings["evaluation_measures"].GetObject(); for (const auto& param : evaluation_info) { - evaluation_measures.emplace( - param.name.GetString(), param.value.GetString()); + if (param.value.IsString()) + evaluation_measures.emplace( + param.name.GetString(), param.value.GetString()); + else + evaluation_measures.emplace(param.name.GetString(), ""); } } } if (openml_dataset == nullptr && openml_split == nullptr) - SG_SERROR("Error parsing task.") + SG_SERROR("Error parsing task.\n") auto result = std::make_shared( task_id, task_name, task_type, task_type_id, evaluation_measures, @@ -996,11 +1035,12 @@ ShogunOpenML::get_class_info(const std::string& class_name) return result; } -CLabels* ShogunOpenML::run_model_on_fold( +std::shared_ptr ShogunOpenML::run_model_on_fold( const std::shared_ptr& model, - const std::shared_ptr& task, CFeatures* X_train, - index_t repeat_number, index_t fold_number, CLabels* y_train, - CFeatures* X_test) + const std::shared_ptr& task, + const std::shared_ptr& X_train, index_t repeat_number, + index_t fold_number, const std::shared_ptr& y_train, + const std::shared_ptr& X_test) { auto task_type = task->get_task_type(); auto model_clone = std::shared_ptr(model->clone()); @@ -1012,9 +1052,14 @@ CLabels* ShogunOpenML::run_model_on_fold( { if (auto machine = std::dynamic_pointer_cast(model_clone)) { - machine->put("labels", y_train); - machine->train(X_train); - return machine->apply(X_test); + machine->put("labels", y_train.get()); + auto tmp = X_train.get(); + machine->train(tmp); + delete tmp; + if (X_test) + return std::shared_ptr(machine->apply(X_test.get())); + else + return 
std::shared_ptr(machine->apply(X_train.get())); } else SG_SERROR("The provided model is not a trainable machine!\n") @@ -1047,15 +1092,26 @@ std::shared_ptr OpenMLRun::run_flow_on_task( std::shared_ptr flow, std::shared_ptr task) { auto data = task->get_dataset(); - std::shared_ptr train_features, test_features; - std::shared_ptr train_labels, test_labels; + std::shared_ptr train_features, test_features = nullptr; + std::shared_ptr train_labels, test_labels = nullptr; if (task->get_split()->contains_splits()) SG_SNOTIMPLEMENTED else { - auto labels = data->get_labels(); - auto feat = data->get_features(); + train_labels = data->get_labels(); + train_features = + data->get_features(data->get_default_target_attribute()); + auto model = ShogunOpenML::flow_to_model(std::move(flow), true); + + if (auto machine = std::dynamic_pointer_cast(model)) + { + auto result = ShogunOpenML::run_model_on_fold( + machine, task, train_features, 0, 0, train_labels, + test_features); + } + else + SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") } return std::shared_ptr(); } diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 1ae2691be9a..28d22fcf0d5 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -351,17 +352,19 @@ namespace shogun get_dataset(const std::string& id, const std::string& api_key); /** - * Returns ALL the features of the dataset, potentially also the labels column + * Returns ALL the features of the dataset, potentially also the labels + * column * @return the features */ - std::shared_ptr get_features() noexcept; + std::shared_ptr get_features() noexcept; /** * Returns the dataset features * @param label_name the name of the attribute containing the label * @return the features */ - std::shared_ptr get_features(const std::string& label_name); + std::shared_ptr + get_features(const std::string& label_name); /** * Returns the dataset labels if 
m_default_target_attribute is not empty @@ -379,12 +382,17 @@ namespace shogun * Returns the type of all attributes/features in the ARFF file * @return */ - // TODO: replace with actual enum values - SG_FORCED_INLINE std::vector get_feature_types() const noexcept + SG_FORCED_INLINE std::vector + get_feature_types() const noexcept { return m_feature_types; } + SG_FORCED_INLINE std::string get_default_target_attribute() const noexcept + { + return m_default_target_attribute; + } + protected: SG_FORCED_INLINE void set_api_key(const std::string& api_key) noexcept { @@ -392,7 +400,6 @@ namespace shogun } private: - void get_data(); std::string m_name; @@ -426,7 +433,7 @@ namespace shogun std::shared_ptr m_cached_features; std::vector m_feature_names; - std::vector m_feature_types; // TODO: replace int with type enum + std::vector m_feature_types; }; /** @@ -558,6 +565,7 @@ namespace shogun std::shared_ptr m_data; }; + class OpenMLRun; /** * The Shogun OpenML extension to run models from an OpenMLFlow * and convert models to OpenMLFlow. @@ -565,6 +573,7 @@ namespace shogun class ShogunOpenML { public: + friend class OpenMLRun; /** * Instantiates a SGObject from an OpenMLFlow. 
* @@ -586,11 +595,13 @@ namespace shogun model_to_flow(const std::shared_ptr& model); protected: - CLabels* run_model_on_fold( + static std::shared_ptr run_model_on_fold( const std::shared_ptr& model, - const std::shared_ptr& task, CFeatures* X_train, - index_t repeat_number, index_t fold_number, CLabels* y_train, - CFeatures* X_test); + const std::shared_ptr& task, + const std::shared_ptr& X_train, + index_t repeat_number, index_t fold_number, + const std::shared_ptr& y_train, + const std::shared_ptr& X_test); private: /** From d40785a95fc6a52089601f14beca6ad22e84f172 Mon Sep 17 00:00:00 2001 From: gf712 Date: Fri, 17 May 2019 16:00:29 +0100 Subject: [PATCH 10/32] smart pointer and refcount fixes --- src/shogun/io/OpenMLFlow.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index 73894f73516..018351861c1 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -1052,10 +1052,10 @@ std::shared_ptr ShogunOpenML::run_model_on_fold( { if (auto machine = std::dynamic_pointer_cast(model_clone)) { - machine->put("labels", y_train.get()); + // TODO: refactor. 
more useless clones until smart pointers are merged + machine->put("labels", y_train->clone()->as()); auto tmp = X_train.get(); machine->train(tmp); - delete tmp; if (X_test) return std::shared_ptr(machine->apply(X_test.get())); else @@ -1092,8 +1092,8 @@ std::shared_ptr OpenMLRun::run_flow_on_task( std::shared_ptr flow, std::shared_ptr task) { auto data = task->get_dataset(); - std::shared_ptr train_features, test_features = nullptr; - std::shared_ptr train_labels, test_labels = nullptr; + std::shared_ptr train_features = nullptr, test_features = nullptr; + std::shared_ptr train_labels = nullptr, test_labels = nullptr; if (task->get_split()->contains_splits()) SG_SNOTIMPLEMENTED @@ -1102,6 +1102,9 @@ std::shared_ptr OpenMLRun::run_flow_on_task( train_labels = data->get_labels(); train_features = data->get_features(data->get_default_target_attribute()); + // ensures delete is called by shared ptr destructor + SG_REF(train_labels.get()) + SG_REF(train_features.get()) auto model = ShogunOpenML::flow_to_model(std::move(flow), true); if (auto machine = std::dynamic_pointer_cast(model)) @@ -1109,6 +1112,7 @@ std::shared_ptr OpenMLRun::run_flow_on_task( auto result = ShogunOpenML::run_model_on_fold( machine, task, train_features, 0, 0, train_labels, test_features); + SG_SDEBUG(result->to_string().c_str()); } else SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") From 7b4a36540932d9243955fde64396de0077f843da Mon Sep 17 00:00:00 2001 From: gf712 Date: Sat, 18 May 2019 11:55:10 +0100 Subject: [PATCH 11/32] split openml classes in different files --- src/interfaces/swig/IO.i | 6 +- src/interfaces/swig/IO_includes.i | 4 +- src/shogun/io/OpenMLFlow.cpp | 1138 ------------------------- src/shogun/io/OpenMLFlow.h | 683 --------------- src/shogun/io/openml/OpenMLData.cpp | 255 ++++++ src/shogun/io/openml/OpenMLData.h | 162 ++++ src/shogun/io/openml/OpenMLFlow.cpp | 115 +++ src/shogun/io/openml/OpenMLFlow.h | 142 +++ src/shogun/io/openml/OpenMLReader.cpp | 97 +++ 
src/shogun/io/openml/OpenMLReader.h | 183 ++++ src/shogun/io/openml/OpenMLRun.cpp | 68 ++ src/shogun/io/openml/OpenMLRun.h | 78 ++ src/shogun/io/openml/OpenMLSplit.cpp | 112 +++ src/shogun/io/openml/OpenMLSplit.h | 71 ++ src/shogun/io/openml/OpenMLTask.cpp | 137 +++ src/shogun/io/openml/OpenMLTask.h | 95 +++ src/shogun/io/openml/ShogunOpenML.cpp | 340 ++++++++ src/shogun/io/openml/ShogunOpenML.h | 70 ++ src/shogun/io/openml/utils.h | 131 +++ 19 files changed, 2064 insertions(+), 1823 deletions(-) delete mode 100644 src/shogun/io/OpenMLFlow.cpp delete mode 100644 src/shogun/io/OpenMLFlow.h create mode 100644 src/shogun/io/openml/OpenMLData.cpp create mode 100644 src/shogun/io/openml/OpenMLData.h create mode 100644 src/shogun/io/openml/OpenMLFlow.cpp create mode 100644 src/shogun/io/openml/OpenMLFlow.h create mode 100644 src/shogun/io/openml/OpenMLReader.cpp create mode 100644 src/shogun/io/openml/OpenMLReader.h create mode 100644 src/shogun/io/openml/OpenMLRun.cpp create mode 100644 src/shogun/io/openml/OpenMLRun.h create mode 100644 src/shogun/io/openml/OpenMLSplit.cpp create mode 100644 src/shogun/io/openml/OpenMLSplit.h create mode 100644 src/shogun/io/openml/OpenMLTask.cpp create mode 100644 src/shogun/io/openml/OpenMLTask.h create mode 100644 src/shogun/io/openml/ShogunOpenML.cpp create mode 100644 src/shogun/io/openml/ShogunOpenML.h create mode 100644 src/shogun/io/openml/utils.h diff --git a/src/interfaces/swig/IO.i b/src/interfaces/swig/IO.i index 4e59008adf4..36ee3d32f26 100644 --- a/src/interfaces/swig/IO.i +++ b/src/interfaces/swig/IO.i @@ -26,6 +26,8 @@ %rename(MemoryMappedFile) CMemoryMappedFile; %shared_ptr(shogun::OpenMLFlow) +%shared_ptr(shogun::OpenMLRun) +%shared_ptr(shogun::OpenMLTask) %include %include @@ -121,7 +123,9 @@ namespace shogun %include %include -%include +%include +%include +%include %include %include %include diff --git a/src/interfaces/swig/IO_includes.i b/src/interfaces/swig/IO_includes.i index 89401074d62..6d26232a513 100644 --- 
a/src/interfaces/swig/IO_includes.i +++ b/src/interfaces/swig/IO_includes.i @@ -13,7 +13,9 @@ #include #include -#include +#include +#include +#include #include #include #include diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp deleted file mode 100644 index 018351861c1..00000000000 --- a/src/shogun/io/OpenMLFlow.cpp +++ /dev/null @@ -1,1138 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). - * - * Authors: Gil Hoben - */ - -#include -#include -#include - -#include -#ifdef HAVE_CURL -#include -#endif // HAVE_CURL - -using namespace shogun; -using namespace rapidjson; - -/** - * The writer callback function used to write the packets to a C++ string. - * @param data the data received in CURL request - * @param size always 1 - * @param nmemb the size of data - * @param buffer_in the buffer to write to - * @return the size of buffer that was written - */ -size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) -{ - // check that the buffer string points to something - if (buffer_in != nullptr) - { - // Append the data to the buffer - buffer_in->append(data, size * nmemb); - - return size * nmemb; - } - return 0; -} - -/* OpenML server format */ -const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; -const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; -const char* OpenMLReader::download_server = ""; -const char* OpenMLReader::splits_server = "https://www.openml.org/api_splits"; - -/* DATA API */ -const char* OpenMLReader::dataset_description = "/data/{}"; -const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; -const char* OpenMLReader::data_features = "/data/features/{}"; -const char* OpenMLReader::data_qualities = "/data/qualities/{}"; -const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; -const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; -/* FLOW API */ -const char* 
OpenMLReader::flow_file = "/flow/{}"; -/* TASK API */ -const char* OpenMLReader::task_file = "/task/{}"; -/* SPLIT API */ -const char* OpenMLReader::get_split = "/get/{}"; - -const std::unordered_map - OpenMLReader::m_format_options = {{"xml", xml_server}, - {"json", json_server}, - {"split", splits_server}, - {"download", download_server}}; -const std::unordered_map - OpenMLReader::m_request_options = { - {"dataset_description", dataset_description}, - {"list_data_qualities", list_data_qualities}, - {"data_features", data_features}, - {"data_qualities", data_qualities}, - {"list_dataset_qualities", list_dataset_qualities}, - {"list_dataset_filter", list_dataset_filter}, - {"flow_file", flow_file}, - {"task_file", task_file}}; - -void OpenMLReader::openml_curl_request_helper(const std::string& url) -{ -#ifdef HAVE_CURL - CURL* curl_handle = nullptr; - - curl_handle = curl_easy_init(); - - if (!curl_handle) - { - SG_SERROR("Failed to initialise curl handle.\n") - return; - } - - curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl_handle, CURLOPT_HTTPGET, 1); - curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); - curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); - - CURLcode res = curl_easy_perform(curl_handle); - - if (res != CURLE_OK) - SG_SERROR("Connection error: %s.\n", curl_easy_strerror(res)) - - curl_easy_cleanup(curl_handle); -#endif // HAVE_CURL -} - -/** - * Checks the returned response from OpenML in JSON format - * @param doc the parsed OpenML JSON format response - */ -static void check_response(const Document& doc, const std::string& type) -{ - if (SG_UNLIKELY(doc.HasMember("error"))) - { - const Value& root = doc["error"]; - SG_SERROR( - "Server error %s: %s\n", root["code"].GetString(), - root["message"].GetString()) - return; - } - REQUIRE( - doc.HasMember(type.c_str()), "Unexpected format of OpenML %s.\n", - type.c_str()); -} - -/** - * Helper function to add JSON objects as string in 
map - * @param v a RapidJSON GenericValue, i.e. string - * @param param_dict the map to write to - * @param name the name of the key - */ -static SG_FORCED_INLINE void emplace_string_to_map( - const GenericValue>& v, - std::unordered_map& param_dict, - const std::string& name) -{ - if (v[name.c_str()].GetType() == Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else - param_dict.emplace(name, ""); -} - -/** - * Helper function to add JSON objects as string in map - * @param v a RapidJSON GenericObject, i.e. array - * @param param_dict the map to write to - * @param name the name of the key - */ -static SG_FORCED_INLINE void emplace_string_to_map( - const GenericObject>>& v, - std::unordered_map& param_dict, - const std::string& name) -{ - if (v[name.c_str()].GetType() == Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else - param_dict.emplace(name, ""); -} - -template -SG_FORCED_INLINE T return_if_possible( - const std::string& name, - const GenericObject>>& v) -{ - SG_SNOTIMPLEMENTED -} - -template <> -SG_FORCED_INLINE std::string return_if_possible( - const std::string& name, - const GenericObject>>& v) -{ - if (v.HasMember(name.c_str()) && v[name.c_str()].IsString()) - return v[name.c_str()].GetString(); - if (v.HasMember(name.c_str()) && !v[name.c_str()].IsString()) - SG_SERROR("Found member \"%s\" but it is not a string", name.c_str()) - if (!v.HasMember(name.c_str())) - return ""; - SG_SERROR("\"%s\" is not a member of the given object", name.c_str()) - return nullptr; -} - -template <> -SG_FORCED_INLINE std::vector -return_if_possible>( - const std::string& name, - const GenericObject>>& v) -{ - std::vector result; - if (!v.HasMember(name.c_str())) - SG_SERROR("\"%s\" is not a member of the given object", name.c_str()) - if (v[name.c_str()].IsString()) - { - result.emplace_back(v[name.c_str()].GetString()); - } - if (v[name.c_str()].IsArray()) - { - for (const auto& val : 
v[name.c_str()].GetArray()) - { - if (val.IsString()) - result.emplace_back(val.GetString()); - else - SG_SERROR("Found non string member in \"%s\".\n", name.c_str()) - } - } - return result; -} - -std::shared_ptr OpenMLFlow::download_flow( - const std::string& flow_id, const std::string& api_key) -{ - Document document; - parameters_type params; - components_type components; - std::string name; - std::string description; - std::string class_name; - - // get flow and parse with RapidJSON - auto reader = OpenMLReader(api_key); - auto return_string = reader.get("flow_file", "json", flow_id); - document.Parse(return_string.c_str()); - check_response(document, "flow"); - - // store root for convenience. We know it exists from previous check. - const Value& root = document["flow"]; - - // handle parameters - if (root.HasMember("parameter")) - { - std::unordered_map param_dict; - - if (root["parameter"].IsArray()) - { - for (const auto& v : root["parameter"].GetArray()) - { - emplace_string_to_map(v, param_dict, "data_type"); - emplace_string_to_map(v, param_dict, "default_value"); - emplace_string_to_map(v, param_dict, "description"); - params.emplace(v["name"].GetString(), param_dict); - param_dict.clear(); - } - } - else - { - // parameter can also be a dict, instead of array - const auto v = root["parameter"].GetObject(); - emplace_string_to_map(v, param_dict, "data_type"); - emplace_string_to_map(v, param_dict, "default_value"); - emplace_string_to_map(v, param_dict, "description"); - params.emplace(v["name"].GetString(), param_dict); - } - } - - // handle components, i.e. 
kernels - if (root.HasMember("component")) - { - if (root["component"].IsArray()) - { - for (const auto& v : root["component"].GetArray()) - { - components.emplace( - v["identifier"].GetString(), - OpenMLFlow::download_flow( - v["flow"]["id"].GetString(), api_key)); - } - } - else - { - components.emplace( - root["component"]["identifier"].GetString(), - OpenMLFlow::download_flow( - root["component"]["flow"]["id"].GetString(), api_key)); - } - } - - // get remaining information from flow - if (root.HasMember("name")) - name = root["name"].GetString(); - if (root.HasMember("description")) - description = root["description"].GetString(); - if (root.HasMember("class_name")) - class_name = root["class_name"].GetString(); - - auto flow = std::make_shared( - name, description, class_name, components, params); - - return flow; -} - -void OpenMLFlow::upload_flow(const std::shared_ptr& flow) -{ - SG_SNOTIMPLEMENTED; -} - -void OpenMLFlow::dump() const -{ - SG_SNOTIMPLEMENTED; -} - -std::shared_ptr OpenMLFlow::from_file() -{ - SG_SNOTIMPLEMENTED; - return std::shared_ptr(); -} - -std::shared_ptr -OpenMLData::get_dataset(const std::string& id, const std::string& api_key) -{ - // description - Document document; - auto reader = OpenMLReader(api_key); - auto return_string = reader.get("dataset_description", "json", id); - - document.Parse(return_string.c_str()); - check_response(document, "data_set_description"); - - const Value& dataset_description = document["data_set_description"]; - - auto name = return_if_possible( - "name", dataset_description.GetObject()); - auto description = return_if_possible( - "description", dataset_description.GetObject()); - auto data_format = return_if_possible( - "data_format", dataset_description.GetObject()); - auto dataset_id = - return_if_possible("id", dataset_description.GetObject()); - auto version = return_if_possible( - "version", dataset_description.GetObject()); - auto creator = return_if_possible( - "creator", 
dataset_description.GetObject()); - auto contributor = return_if_possible( - "contributor", dataset_description.GetObject()); - auto collection_date = return_if_possible( - "collection_date", dataset_description.GetObject()); - auto upload_date = return_if_possible( - "upload_date", dataset_description.GetObject()); - auto language = return_if_possible( - "language", dataset_description.GetObject()); - auto licence = return_if_possible( - "licence", dataset_description.GetObject()); - auto url = - return_if_possible("url", dataset_description.GetObject()); - auto default_target_attribute = return_if_possible( - "default_target_attribute", dataset_description.GetObject()); - auto row_id_attribute = return_if_possible( - "row_id_attribute", dataset_description.GetObject()); - auto ignore_attribute = return_if_possible( - "ignore_attribute", dataset_description.GetObject()); - auto version_label = return_if_possible( - "version_label", dataset_description.GetObject()); - auto citation = return_if_possible( - "citation", dataset_description.GetObject()); - auto tags = return_if_possible>( - "tag", dataset_description.GetObject()); - auto visibility = return_if_possible( - "visibility", dataset_description.GetObject()); - auto original_data_url = return_if_possible( - "original_data_url", dataset_description.GetObject()); - auto paper_url = return_if_possible( - "paper_url", dataset_description.GetObject()); - auto update_comment = return_if_possible( - "update_comment", dataset_description.GetObject()); - auto md5_checksum = return_if_possible( - "md5_checksum", dataset_description.GetObject()); - - // features - std::vector>> - param_vector; - return_string = reader.get("data_features", "json", id); - document.Parse(return_string.c_str()); - check_response(document, "data_features"); - const Value& dataset_features = document["data_features"]; - for (const auto& param : dataset_features["feature"].GetArray()) - { - std::unordered_map> param_map; - for (const auto& 
param_descriptors : param.GetObject()) - { - std::vector second; - if (param_descriptors.value.IsArray()) - for (const auto& v : param_descriptors.value.GetArray()) - second.emplace_back(v.GetString()); - else - second.emplace_back(param_descriptors.value.GetString()); - - param_map.emplace(param_descriptors.name.GetString(), second); - } - param_vector.push_back(param_map); - } - - // qualities - std::vector> qualities_vector; - return_string = reader.get("data_qualities", "json", id); - document.Parse(return_string.c_str()); - check_response(document, "data_qualities"); - const Value& data_qualities = document["data_qualities"]; - for (const auto& param : data_qualities["quality"].GetArray()) - { - std::unordered_map param_map; - for (const auto& param_quality : param.GetObject()) - { - if (param_quality.name.IsString() && param_quality.value.IsString()) - param_map.emplace( - param_quality.name.GetString(), - param_quality.value.GetString()); - else if (param_quality.name.IsString()) - param_map.emplace(param_quality.name.GetString(), ""); - } - qualities_vector.push_back(param_map); - } - - auto result = std::make_shared( - name, description, data_format, dataset_id, version, creator, - contributor, collection_date, upload_date, language, licence, url, - default_target_attribute, row_id_attribute, ignore_attribute, - version_label, citation, tags, visibility, original_data_url, paper_url, - update_comment, md5_checksum, param_vector, qualities_vector); - result->set_api_key(api_key); - return result; -} - -std::shared_ptr OpenMLData::get_features() noexcept -{ - if (!m_cached_features) - get_data(); - return m_cached_features; -} - -std::shared_ptr OpenMLData::get_features(const std::string& label) -{ - if (!m_cached_features) - get_data(); - auto find_label = - std::find(m_feature_names.begin(), m_feature_names.end(), label); - if (find_label == m_feature_names.end()) - SG_SERROR("Requested label \"%s\" not in the dataset!\n", label.c_str()) - if 
(!m_cached_features) - get_data(); - auto col_idx = std::distance(m_feature_names.begin(), find_label); - auto feat_type_copy = m_feature_types; - feat_type_copy.erase(feat_type_copy.begin() + col_idx); - for (const auto type : feat_type_copy) - { - if (type == ARFFDeserializer::Attribute::STRING) - SG_SERROR("Currently cannot process string features!\n") - } - std::shared_ptr result; - bool first = true; - for (int i = 0; i < m_feature_types.size(); ++i) - { - if (i != col_idx && first) - { - result.reset(m_cached_features->get_feature_obj(i)); - first = false; - } - if (i != col_idx) - result.reset(result->create_merged_copy( - m_cached_features->get_feature_obj(i))); - } - std::dynamic_pointer_cast>(result)->set_num_features(m_feature_types.size()); - std::dynamic_pointer_cast>(result)->set_num_vectors(m_cached_features->get_num_vectors()); - - return result; -} - -std::shared_ptr OpenMLData::get_labels() -{ - if (!m_cached_features) - get_data(); - REQUIRE( - !m_default_target_attribute.empty(), - "A default target attribute is required if no label is given!\n") - return get_labels(m_default_target_attribute); -} - -std::shared_ptr OpenMLData::get_labels(const std::string& label_name) -{ - if (!m_cached_features) - get_data(); - auto find_label = - std::find(m_feature_names.begin(), m_feature_names.end(), label_name); - if (find_label == m_feature_names.end()) - SG_SERROR( - "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) - auto col_idx = std::distance(m_feature_names.begin(), find_label); - - if (!m_cached_features) - get_data(); - - auto target_label_as_feat = - std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); - - switch (m_feature_types[col_idx]) - { - // real features - case ARFFDeserializer::Attribute::REAL: - case ARFFDeserializer::Attribute::NUMERIC: - case ARFFDeserializer::Attribute::INTEGER: - case ARFFDeserializer::Attribute::DATE: - { - auto casted_feat = std::dynamic_pointer_cast>( - target_label_as_feat); - auto 
labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); - auto labels = std::make_shared(labels_vec); - return labels; - } - break; - // nominal features - case ARFFDeserializer::Attribute::NOMINAL: - { - auto casted_feat = std::dynamic_pointer_cast>( - target_label_as_feat); - auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); - for(auto& val: labels_vec) - { - if (val == 0) - val = -1; - } - auto labels = std::make_shared(labels_vec); - return labels; - } - break; - default: - SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) - } - - return nullptr; -} - -void OpenMLData::get_data() -{ - auto reader = OpenMLReader(m_api_key); - std::shared_ptr ss = - std::make_shared(reader.get(m_url)); - - auto parser = ARFFDeserializer(ss); - parser.read(); - m_cached_features = parser.get_features(); - m_feature_names = parser.get_feature_names(); - m_feature_types = parser.get_attribute_types(); -} - -std::shared_ptr -OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) -{ - auto reader = OpenMLReader(api_key); - auto return_string = reader.get("get_split", "split", split_url); - - if (return_string == "Task not providing datasplits.") - return std::make_shared(); - - auto return_stream = std::istringstream(return_string); - // TODO: add ARFF parsing here - // get train/test indices - // TODO: replace line below with ARFFDeserialiser::get_features() - auto arff_features = std::make_shared(); - REQUIRE( - arff_features->get_num_feature_obj() == 4, - "Expected a ARFF file with 4 attributes: type, rowid, repeat and " - "fold.\n") - - auto train_test_feat = - std::shared_ptr(arff_features->get_feature_obj(0)); - auto rowid_feat = - std::shared_ptr(arff_features->get_feature_obj(1)); - auto repeat_feat = - std::shared_ptr(arff_features->get_feature_obj(2)); - auto fold_feat = - std::shared_ptr(arff_features->get_feature_obj(3)); - - auto type_vector = string_feature_to_vector(train_test_feat); - auto 
rowid_vector = dense_feature_to_vector(rowid_feat); - auto repeat_vector = dense_feature_to_vector(repeat_feat); - auto fold_vector = dense_feature_to_vector(fold_feat); - - std::vector> train_idx, test_idx; - for (int i = 0; i < arff_features->get_num_vectors(); ++i) - { - if (type_vector[i] == LabelType::TRAIN) - train_idx.emplace_back(std::initializer_list{ - static_cast(rowid_vector[i]), - static_cast(repeat_vector[i]), - static_cast(fold_vector[i])}); - else - test_idx.emplace_back(std::initializer_list{ - static_cast(rowid_vector[i]), - static_cast(repeat_vector[i]), - static_cast(fold_vector[i])}); - } - - return std::make_shared(train_idx, test_idx); -} - -SGVector -OpenMLSplit::dense_feature_to_vector(const std::shared_ptr& feat) -{ - auto casted_feat = - std::dynamic_pointer_cast>(feat); - // this should never happen - if (!casted_feat) - SG_SERROR("Error casting a column in the split file from CFeatures to " - "CDenseFeatures!\n>"); - return casted_feat->get_feature_vector(0); -} - -std::vector -OpenMLSplit::string_feature_to_vector(const std::shared_ptr& feat) -{ - auto casted_feat = std::dynamic_pointer_cast>(feat); - // this should never happen - if (!casted_feat) - SG_SERROR("Error casting a column in the split file from CFeatures to " - "CStringFeatures!\n"); - - auto to_lower = [](const std::string& line) { - std::string result; - std::transform( - line.begin(), line.end(), std::back_inserter(result), - [](uint8_t val) { return std::tolower(val); }); - return result; - }; - - std::vector result; - - for (int i = 0; i < casted_feat->get_num_vectors(); ++i) - { - auto row = casted_feat->get_feature_vector(i); - std::string label(1, row[0]); - for (auto j = 1; j < casted_feat->get_max_vector_length(); ++j) - label.append(1, row[j]); - if (to_lower(label) == "train") - result.push_back(LabelType::TRAIN); - else if (to_lower(label) == "test") - result.push_back(LabelType::TEST); - else - SG_SERROR("Unknown label type in split file %s!\n", label.c_str()) 
- } - return result; -} - -std::shared_ptr -OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) -{ - Document document; - std::string task_name; - std::string task_type_id; - std::shared_ptr openml_dataset = nullptr; - std::shared_ptr openml_split = nullptr; - std::unordered_map evaluation_measures; - - auto reader = OpenMLReader(api_key); - auto return_string = reader.get("task_file", "json", task_id); - - document.Parse(return_string.c_str()); - check_response(document, "task"); - - const Value& root = document["task"]; - - REQUIRE( - task_id == root["task_id"].GetString(), - "Expected downloaded task to have the same id as the requested task " - "id, but got \"%s\", instead of \"%s\".\n", - root["task_id"].GetString(), task_id.c_str()) - - task_name = root["task_name"].GetString(); - OpenMLTask::TaskType task_type = - get_task_from_string(root["task_type"].GetString()); - task_type_id = root["task_type_id"].GetString(); - - // expect two elements in input array: dataset and split - const Value& json_input = root["input"]; - - auto input_array = json_input.GetArray(); - - for (const auto& task_settings : input_array) - { - if (strcmp(task_settings["name"].GetString(), "source_data") == 0) - { - auto dataset_info = task_settings["data_set"].GetObject(); - std::string dataset_id = dataset_info["data_set_id"].GetString(); - std::string target_feature = - dataset_info["target_feature"].GetString(); - openml_dataset = OpenMLData::get_dataset(dataset_id, api_key); - } - else if ( - strcmp(task_settings["name"].GetString(), "estimation_procedure") == - 0) - { - auto split_info = task_settings["estimation_procedure"].GetObject(); - std::string split_id = split_info["id"].GetString(); - std::string split_type = split_info["type"].GetString(); - std::string split_url = split_info["data_splits_url"].GetString(); - std::unordered_map split_parameters; - for (const auto& param : split_info["parameter"].GetArray()) - { - if (param.HasMember("name") && 
param.HasMember("value")) - split_parameters.emplace( - param["name"].GetString(), param["value"].GetString()); - else if (param.HasMember("name")) - split_parameters.emplace(param["name"].GetString(), ""); - else - SG_SERROR( - "Unexpected number of parameters in parameter array " - "of estimation_procedure.\n") - } - REQUIRE( - split_type == "crossvalidation", - "Currently only tasks with cross validation are enabled in " - "shogun!\n") - openml_split = OpenMLSplit::get_split(split_url, api_key); - } - else if ( - strcmp(task_settings["name"].GetString(), "evaluation_measures") == - 0) - { - auto evaluation_info = - task_settings["evaluation_measures"].GetObject(); - for (const auto& param : evaluation_info) - { - if (param.value.IsString()) - evaluation_measures.emplace( - param.name.GetString(), param.value.GetString()); - else - evaluation_measures.emplace(param.name.GetString(), ""); - } - } - } - - if (openml_dataset == nullptr && openml_split == nullptr) - SG_SERROR("Error parsing task.\n") - - auto result = std::make_shared( - task_id, task_name, task_type, task_type_id, evaluation_measures, - openml_split, openml_dataset); - - return result; -} - -OpenMLTask::TaskType -OpenMLTask::get_task_from_string(const std::string& task_type) -{ - if (task_type == "Supervised Classification") - return OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION; - SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) -} - -SGMatrix OpenMLTask::get_train_indices() const -{ - SG_SNOTIMPLEMENTED - return SGMatrix(); -} - -SGMatrix OpenMLTask::get_test_indices() const -{ - SG_SNOTIMPLEMENTED - return SGMatrix(); -} - -/** - * Class using the Any visitor pattern to convert - * a string to a C++ type that can be used as a parameter - * in a Shogun model. If the string value is not "null" it will - * be put in its casted type in the given model with the provided parameter - * name. If the value is null nothing happens, i.e. 
no error is thrown - * and no value is put in model. - */ -class StringToShogun : public AnyVisitor -{ -public: - explicit StringToShogun(std::shared_ptr model) - : m_model(model), m_parameter(""), m_string_val(""){}; - - StringToShogun( - std::shared_ptr model, const std::string& parameter, - const std::string& string_val) - : m_model(model), m_parameter(parameter), m_string_val(string_val){}; - - void on(bool* v) final - { - SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) - { - bool result = strcmp(m_string_val.c_str(), "true") == 0; - m_model->put(m_parameter, result); - } - } - void on(int32_t* v) final - { - SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) - { - try - { - int32_t result = std::stoi(m_string_val); - m_model->put(m_parameter, result); - } - catch (const std::invalid_argument&) - { - // it's an option, i.e. internally represented - // as an enum but in swig exposed as a string - m_string_val.erase( - std::remove_if( - m_string_val.begin(), m_string_val.end(), - // remove quotes - [](const auto& val) { return val == '\"'; }), - m_string_val.end()); - m_model->put(m_parameter, m_string_val); - } - } - } - void on(int64_t* v) final - { - SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) - { - - int64_t result = std::stol(m_string_val); - m_model->put(m_parameter, result); - } - } - void on(float* v) final - { - SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) - { - float32_t result = std::stof(m_string_val); - m_model->put(m_parameter, result); - } - } - void on(double* v) final - { - SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) - { - float64_t result = std::stod(m_string_val); - m_model->put(m_parameter, result); - } - } - void on(long double* v) - { - SG_SDEBUG( - "long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) 
- { - floatmax_t result = std::stold(m_string_val); - m_model->put(m_parameter, result); - } - } - void on(CSGObject** v) final - { - SG_SDEBUG( - "CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - } - void on(SGVector* v) final - { - SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - } - void on(SGVector* v) final - { - SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) - } - void on(SGVector* v) final - { - SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) - } - void on(SGMatrix* mat) final - { - SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - } - void on(SGMatrix* mat) final - { - SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) - } - void on(SGMatrix* mat) final{SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())} - - /** - * In OpenML "null" is an empty parameter value field. - * @return whether the field is "null" - */ - SG_FORCED_INLINE bool is_null() const noexcept - { - bool result = strcmp(m_string_val.c_str(), "null") == 0; - return result; - } - - SG_FORCED_INLINE void set_parameter_name(const std::string& name) noexcept - { - m_parameter = name; - } - - SG_FORCED_INLINE void set_string_value(const std::string& value) noexcept - { - m_string_val = value; - } - -private: - std::shared_ptr m_model; - std::string m_parameter; - std::string m_string_val; -}; - -/** - * Instantiates a CSGObject using a factory - * @param factory_name the name of the factory - * @param algo_name the name of algorithm passed to factory - * @return the instantiated object using a factory - */ -std::shared_ptr instantiate_model_from_factory( - const std::string& factory_name, const std::string& algo_name) -{ - if (factory_name == "machine") - return std::shared_ptr(machine(algo_name)); - if (factory_name == "kernel") - return std::shared_ptr(kernel(algo_name)); - if (factory_name == "distance") 
- return std::shared_ptr(distance(algo_name)); - - SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) - - return nullptr; -} - -/** - * Downcasts a CSGObject and puts it in the map of obj. - * @param obj the main object - * @param nested_obj the object to be casted and put in the obj map. - * @param parameter_name the name of nested_obj - */ -void cast_and_put( - const std::shared_ptr& obj, - const std::shared_ptr& nested_obj, - const std::string& parameter_name) -{ - if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) - { - // TODO: remove clone - // temporary fix until shared_ptr PR merged - auto* tmp_clone = dynamic_cast(casted_obj->clone()); - obj->put(parameter_name, tmp_clone); - return; - } - if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) - { - auto* tmp_clone = dynamic_cast(casted_obj->clone()); - obj->put(parameter_name, tmp_clone); - return; - } - if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) - { - auto* tmp_clone = dynamic_cast(casted_obj->clone()); - obj->put(parameter_name, tmp_clone); - return; - } - SG_SERROR("Could not cast SGObject.\n") -} - -std::shared_ptr ShogunOpenML::flow_to_model( - std::shared_ptr flow, bool initialize_with_defaults) -{ - auto params = flow->get_parameters(); - auto components = flow->get_components(); - auto class_name = get_class_info(flow->get_class_name()); - auto module_name = class_name.first; - auto algo_name = class_name.second; - - auto obj = instantiate_model_from_factory(module_name, algo_name); - auto obj_param = obj->get_params(); - - auto visitor = std::make_unique(obj); - - if (initialize_with_defaults) - { - for (const auto& param : params) - { - Any any_val = obj_param.at(param.first)->get_value(); - std::string name = param.first; - std::string val_as_string = param.second.at("default_value"); - visitor->set_parameter_name(name); - visitor->set_string_value(val_as_string); - any_val.visit(visitor.get()); - } - } - - for (const auto& component : components) 
- { - std::shared_ptr nested_obj = - flow_to_model(component.second, initialize_with_defaults); - cast_and_put(obj, nested_obj, component.first); - } - - SG_SDEBUG("Final object: %s.\n", obj->to_string().c_str()); - - return obj; -} - -std::shared_ptr -ShogunOpenML::model_to_flow(const std::shared_ptr& model) -{ - return std::shared_ptr(); -} - -std::pair -ShogunOpenML::get_class_info(const std::string& class_name) -{ - std::vector class_components; - auto begin = class_name.begin(); - std::pair result; - - for (auto it = class_name.begin(); it != class_name.end(); ++it) - { - if (*it == '.') - { - class_components.emplace_back(std::string(begin, it)); - begin = std::next(it); - } - if (std::next(it) == class_name.end()) - class_components.emplace_back(std::string(begin, std::next(it))); - } - - if (class_components[0] == "shogun" && class_components.size() == 3) - result = std::make_pair(class_components[1], class_components[2]); - else if (class_components[0] == "shogun" && class_components.size() != 3) - SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) - else - SG_SERROR( - "The provided flow is not meant for shogun deserialisation! The " - "required library is \"%s\".\n", - class_components[0].c_str()) - - return result; -} - -std::shared_ptr ShogunOpenML::run_model_on_fold( - const std::shared_ptr& model, - const std::shared_ptr& task, - const std::shared_ptr& X_train, index_t repeat_number, - index_t fold_number, const std::shared_ptr& y_train, - const std::shared_ptr& X_test) -{ - auto task_type = task->get_task_type(); - auto model_clone = std::shared_ptr(model->clone()); - - switch (task_type) - { - case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: - case OpenMLTask::TaskType::SUPERVISED_REGRESSION: - { - if (auto machine = std::dynamic_pointer_cast(model_clone)) - { - // TODO: refactor. 
more useless clones until smart pointers are merged - machine->put("labels", y_train->clone()->as()); - auto tmp = X_train.get(); - machine->train(tmp); - if (X_test) - return std::shared_ptr(machine->apply(X_test.get())); - else - return std::shared_ptr(machine->apply(X_train.get())); - } - else - SG_SERROR("The provided model is not a trainable machine!\n") - } - break; - case OpenMLTask::TaskType::LEARNING_CURVE: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::CLUSTERING: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: - SG_SNOTIMPLEMENTED - } - return nullptr; -} - -std::shared_ptr OpenMLRun::run_model_on_task( - std::shared_ptr model, std::shared_ptr task) -{ - SG_SNOTIMPLEMENTED - return std::shared_ptr(); -} - -std::shared_ptr OpenMLRun::run_flow_on_task( - std::shared_ptr flow, std::shared_ptr task) -{ - auto data = task->get_dataset(); - std::shared_ptr train_features = nullptr, test_features = nullptr; - std::shared_ptr train_labels = nullptr, test_labels = nullptr; - - if (task->get_split()->contains_splits()) - SG_SNOTIMPLEMENTED - else - { - train_labels = data->get_labels(); - train_features = - data->get_features(data->get_default_target_attribute()); - // ensures delete is called by shared ptr destructor - SG_REF(train_labels.get()) - SG_REF(train_features.get()) - auto model = ShogunOpenML::flow_to_model(std::move(flow), true); - - if (auto machine = std::dynamic_pointer_cast(model)) - { - auto result = ShogunOpenML::run_model_on_fold( - machine, task, train_features, 0, 0, train_labels, - test_features); - SG_SDEBUG(result->to_string().c_str()); - } - else - SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") - } - return std::shared_ptr(); -} - -std::shared_ptr 
-OpenMLRun::from_filesystem(const std::string& directory) -{ - SG_SNOTIMPLEMENTED - return nullptr; -} - -void OpenMLRun::to_filesystem(const std::string& directory) const -{ - SG_SNOTIMPLEMENTED -} - -void OpenMLRun::publish() const -{ - SG_SNOTIMPLEMENTED -} diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h deleted file mode 100644 index 28d22fcf0d5..00000000000 --- a/src/shogun/io/OpenMLFlow.h +++ /dev/null @@ -1,683 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). - * - * Authors: Gil Hoben - */ - -#ifndef SHOGUN_OPENMLFLOW_H -#define SHOGUN_OPENMLFLOW_H - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace shogun -{ - /** - * Reads OpenML streams which can be downloaded with this function. - */ - class OpenMLReader - { - - public: - explicit OpenMLReader(const std::string& api_key) : m_api_key(api_key) - { - } - - /** - * Returns a string returned by the server given a request. - * Raises an error if the returned code is not 200. - * Additional arguments can be passed to the request, - * which are then concatenated with a "/" character. - * - * @tparam Args argument type pack, should all be std::string - * @param request the request name, see m_request_options - * @param format the format to return the data in, see m_format_options - * @param args the additional arguments to be passed to request - * @return the returned stream from the server if the return code is 200 - */ - template - std::string - get(const std::string& request, const std::string& format, Args... 
args) - { - std::string request_path; - auto find_format = m_format_options.find(format); - if (find_format == m_format_options.end()) - { - SG_SERROR( - "The provided format \"%s\" is not available\n", - format.c_str()) - } - - if (format == "split") - { - REQUIRE( - request == "get_split", - "Split server can only handle \"get_split\" request.\n") - request_path = get_split; - } - else - { - auto find_request = m_request_options.find(request); - if (find_request == m_request_options.end()) - { - SG_SERROR( - "Could not find a way to solve the request \"%s\"\n", - request.c_str()) - } - request_path = find_request->second; - } - - std::string request_format = find_format->second; - - // get additional args and concatenate them with "/" - if (sizeof...(Args) > 0) - { - if (request_path.substr(request_path.size() - 2) == "{}") - { - request_path = - request_path.substr(0, request_path.size() - 2); - } - else - { - SG_SERROR( - "The provided request \"%s\" cannot handle additional " - "args.\n", - request.c_str()) - } - std::vector args_vec = {args...}; - std::string args_string = std::accumulate( - args_vec.begin() + 1, args_vec.end(), args_vec.front(), - [](std::string s0, std::string& s1) { - return s0 += "/" + s1; - }); - request_path += args_string; - } - - std::string url = request_format + request_path + "?" + m_api_key; - - return get(url); - } - - std::string get(const std::string& url) - { -#ifdef HAVE_CURL - // clear the buffer before request - m_curl_response_buffer.clear(); - - openml_curl_request_helper(url); - return m_curl_response_buffer; -#else - SG_SERROR( - "This function is only available with the CURL library!\n") -#endif // HAVE_CURL - } - - private: - /** the raw buffer as a C++ string */ - std::string m_curl_response_buffer; - - /** - * Initialises CURL session and gets the data. - * This function also handles the response code from the server. 
- * - * @param url the url to query - */ - void openml_curl_request_helper(const std::string& url); - - /** the user API key, not required for all requests */ - std::string m_api_key; - - /** the server path to get a response in XML format*/ - static const char* xml_server; - /** the server path to get a response in JSON format*/ - static const char* json_server; - /** the server path to download datasets */ - static const char* download_server; - /** the server path to get a split in ARFF format */ - static const char* splits_server; - - /** the server response format options: XML or JSON */ - static const std::unordered_map - m_format_options; - /** all the supported server options */ - static const std::unordered_map - m_request_options; - - /* DATA API */ - static const char* dataset_description; - static const char* list_data_qualities; - static const char* data_features; - static const char* data_qualities; - static const char* list_dataset_qualities; - static const char* list_dataset_filter; - - /* FLOW API */ - static const char* flow_file; - - /* TASK API */ - static const char* task_file; - - /* SPLIT API */ - static const char* get_split; - }; - - /** - * Writes OpenML streams to the OpenML server. - */ - class OpenMLWritter - { - public: - OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; - - private: - /** the user API key, likely to be needed to write to OpenML */ - std::string m_api_key; - }; - - /** - * Handles OpenML flows. A flow contains the information - * required to instantiate a model. - */ - class OpenMLFlow - { - - public: - /** alias for component type, map of flows */ - using components_type = - std::unordered_map>; - /** alias for parameter type, map of maps with information specific to a - * parameter */ - using parameters_type = std::unordered_map< - std::string, std::unordered_map>; - - /** - * The OpenMLFlow constructor. 
This constructor is rarely used by the - * user and is used by the static class members download_flow and - * from_file. The user is expected to use either of the previously - * mentioned functions. - * - * @param name the model name - * @param description the model description - * @param model the flow class_name field - * @param components a map of subflows, i.e. kernels - * @param parameters a map of parameter information, i.e. default values - * for each parameter name - */ - OpenMLFlow( - const std::string& name, const std::string& description, - const std::string& model, components_type components, - parameters_type parameters) - : m_name(name), m_description(description), m_class_name(model), - m_parameters(std::move(parameters)), - m_components(std::move(components)) - { - } - - /** - * Instantiates a OpenMLFlow by downloaded a flow from the OpenML - * server. - * - * @param flow_id the flow ID - * @param api_key the user API key (might not be required and can be an - * empty string) - * @return the OpenMLFlow corresponding to the flow requested - * @throws ShogunException when there is a server error or the requested - * flow is ill formed. - */ - static std::shared_ptr - download_flow(const std::string& flow_id, const std::string& api_key); - - /** - * Instantiates a OpenMLFlow from a file. - * @return the OpenMLFlow corresponding to the flow requested - */ - static std::shared_ptr from_file(); - - /** - * Publishes a flow to the OpenML server - * @param flow the flow to be published - */ - static void upload_flow(const std::shared_ptr& flow); - - /** - * Dumps the OpenMLFlow to disk. - */ - void dump() const; - - /** - * Gets a subflow, i.e. 
a kernel in a machine - * @param name the name of the subflow, not the flow ID - * @return the subflow if it exists - */ - std::shared_ptr get_subflow(const std::string& name) - { - auto find_flow = m_components.find(name); - if (find_flow != m_components.end()) - return find_flow->second; - else - SG_SERROR( - "The provided subflow could not be found in this flow!") - return nullptr; - } - -#ifndef SWIG - SG_FORCED_INLINE parameters_type get_parameters() const noexcept - { - return m_parameters; - } - - SG_FORCED_INLINE components_type get_components() const noexcept - { - return m_components; - } - - SG_FORCED_INLINE std::string get_class_name() const noexcept - { - return m_class_name; - } -#endif // SWIG - - private: - /** name field of the flow */ - std::string m_name; - /** description field of the flow */ - std::string m_description; - /** the class_name field of the flow */ - std::string m_class_name; - /** the parameter field of the flow (optional) */ - parameters_type m_parameters; - /** the components fields of the flow (optional) */ - components_type m_components; - }; - - /** - * Handles an OpenML dataset. 
- */ - class OpenMLData - { - public: - OpenMLData( - const std::string& name, const std::string& description, - const std::string& data_format, const std::string& dataset_id, - const std::string& version, const std::string& creator, - const std::string& contributor, const std::string& collection_date, - const std::string& upload_date, const std::string& language, - const std::string& license, const std::string& url, - const std::string& default_target_attribute, - const std::string& row_id_attribute, - const std::string& ignore_attribute, - const std::string& version_label, const std::string& citation, - std::vector tag, const std::string& visibility, - const std::string& original_data_url, const std::string& paper_url, - const std::string& update_comment, const std::string& md5_checksum, - std::vector< - std::unordered_map>> - param_descriptors, - std::vector> - param_qualities) - : m_name(name), m_description(description), - m_data_format(data_format), m_dataset_id(dataset_id), - m_version(version), m_creator(creator), - m_contributor(contributor), m_collection_date(collection_date), - m_upload_date(upload_date), m_language(language), - m_license(license), m_url(url), - m_default_target_attribute(default_target_attribute), - m_row_id_attribute(row_id_attribute), - m_ignore_attribute(ignore_attribute), - m_version_label(version_label), m_citation(citation), - m_tag(std::move(tag)), m_visibility(visibility), - m_original_data_url(original_data_url), m_paper_url(paper_url), - m_update_comment(update_comment), m_md5_checksum(md5_checksum), - m_param_descriptors(std::move(param_descriptors)), - m_param_qualities(std::move(param_qualities)) - { - } - - /** - * Creates a dataset instance from a given ID. 
- * - */ - static std::shared_ptr - get_dataset(const std::string& id, const std::string& api_key); - - /** - * Returns ALL the features of the dataset, potentially also the labels - * column - * @return the features - */ - std::shared_ptr get_features() noexcept; - - /** - * Returns the dataset features - * @param label_name the name of the attribute containing the label - * @return the features - */ - std::shared_ptr - get_features(const std::string& label_name); - - /** - * Returns the dataset labels if m_default_target_attribute is not empty - * @return the labels - */ - std::shared_ptr get_labels(); - - /** - * Returns the dataset labels given the label_name - * @return the labels - */ - std::shared_ptr get_labels(const std::string& label_name); - - /** - * Returns the type of all attributes/features in the ARFF file - * @return - */ - SG_FORCED_INLINE std::vector - get_feature_types() const noexcept - { - return m_feature_types; - } - - SG_FORCED_INLINE std::string get_default_target_attribute() const noexcept - { - return m_default_target_attribute; - } - - protected: - SG_FORCED_INLINE void set_api_key(const std::string& api_key) noexcept - { - m_api_key = api_key; - } - - private: - void get_data(); - - std::string m_name; - std::string m_description; - std::string m_data_format; - std::string m_dataset_id; - std::string m_version; - std::string m_creator; - std::string m_contributor; - std::string m_collection_date; - std::string m_upload_date; - std::string m_language; - std::string m_license; - std::string m_url; - std::string m_default_target_attribute; - std::string m_row_id_attribute; - std::string m_ignore_attribute; - std::string m_version_label; - std::string m_citation; - std::vector m_tag; - std::string m_visibility; - std::string m_original_data_url; - std::string m_paper_url; - std::string m_update_comment; - std::string m_md5_checksum; - std::vector>> - m_param_descriptors; - std::vector> - m_param_qualities; - std::string m_api_key; - - 
std::shared_ptr m_cached_features; - std::vector m_feature_names; - std::vector m_feature_types; - }; - - /** - * Handles an OpenML split. - */ - class OpenMLSplit - { - public: - enum class LabelType - { - TRAIN = 1, - TEST = 2 - }; - - /** - * Default constructor. This is used when there are no - * train or test indices. - */ - OpenMLSplit() = default; - - OpenMLSplit( - std::vector> train_idx, - std::vector> test_idx) - : m_train_idx(std::move(train_idx)), m_test_idx(std::move(test_idx)) - { - } - - static std::shared_ptr - get_split(const std::string& split_url, const std::string& api_key); - - SG_FORCED_INLINE std::vector> get_train_idx() const - noexcept - { - return m_train_idx; - } - - SG_FORCED_INLINE std::vector> get_test_idx() const - noexcept - { - return m_test_idx; - } - - SG_FORCED_INLINE bool contains_splits() const noexcept - { - return !m_train_idx.empty() && !m_test_idx.empty(); - } - - private: - static SGVector - dense_feature_to_vector(const std::shared_ptr& feat); - - static std::vector - string_feature_to_vector(const std::shared_ptr& feat); - - std::vector> m_train_idx; - std::vector> m_test_idx; - }; - - /** - * Handles OpenML tasks. A task contains all the information - * required to train and test a model. 
- */ - class OpenMLTask - { - public: - enum class TaskType - { - SUPERVISED_CLASSIFICATION = 0, - SUPERVISED_REGRESSION = 1, - LEARNING_CURVE = 2, - SUPERVISED_DATASTREAM_CLASSIFICATION = 3, - CLUSTERING = 4, - MACHINE_LEARNING_CHALLENGE = 5, - SURVIVAL_ANALYSIS = 6, - SUBGROUP_DISCOVERY = 7 - }; - - enum class TaskEvaluation - { - - }; - - OpenMLTask( - const std::string& task_id, const std::string task_name, - TaskType task_type, const std::string& task_type_id, - std::unordered_map evaluation_measures, - std::shared_ptr split, - std::shared_ptr data) - : m_task_id(task_id), m_task_name(task_name), - m_task_type(task_type), m_task_type_id(task_type_id), - m_evaluation_measures(std::move(evaluation_measures)), - m_split(std::move(split)), m_data(std::move(data)) - { - } - - static std::shared_ptr - get_task(const std::string& task_id, const std::string& api_key); - - std::shared_ptr get_dataset() const noexcept - { - return m_data; - } - - std::shared_ptr get_split() const noexcept - { - return m_split; - } - - SGMatrix get_train_indices() const; - - SGMatrix get_test_indices() const; - -#ifndef SWIG - SG_FORCED_INLINE TaskType get_task_type() const noexcept - { - return m_task_type; - } -#endif // SWIG - - private: - static TaskType get_task_from_string(const std::string& task_type); - - std::string m_task_id; - std::string m_task_name; - TaskType m_task_type; - std::string m_task_type_id; - std::unordered_map m_evaluation_measures; - std::shared_ptr m_split; - std::shared_ptr m_data; - }; - - class OpenMLRun; - /** - * The Shogun OpenML extension to run models from an OpenMLFlow - * and convert models to OpenMLFlow. - */ - class ShogunOpenML - { - public: - friend class OpenMLRun; - /** - * Instantiates a SGObject from an OpenMLFlow. 
- * - * @param flow the flow to instantiate - * @param initialize_with_defaults whether to use the default values - * specified in the flow - * @return the flow as a trainable model - */ - static std::shared_ptr flow_to_model( - std::shared_ptr flow, bool initialize_with_defaults); - - /** - * Converts a SGObject to an OpenMLFlow. - * - * @param model the model to convert - * @return the flow from the model conversion - */ - static std::shared_ptr - model_to_flow(const std::shared_ptr& model); - - protected: - static std::shared_ptr run_model_on_fold( - const std::shared_ptr& model, - const std::shared_ptr& task, - const std::shared_ptr& X_train, - index_t repeat_number, index_t fold_number, - const std::shared_ptr& y_train, - const std::shared_ptr& X_test); - - private: - /** - * Helper function to extract module/factory information from the - * class name field of OpenMLFlow. Throws an error either if the - * class name field is ill formed (i.e. not - * library.module.algorithm) or if the library name is not "shogun". 
- * - * @param class_name the flow class_name field - * @return a tuple with the module name (factory string) and the - * algorithm name - */ - static std::pair - get_class_info(const std::string& class_name); - }; - - class OpenMLRun - { - public: - OpenMLRun( - const std::string& uploader, const std::string& uploader_name, - const std::string& setup_id, const std::string& setup_string, - const std::string& parameter_settings, - std::vector evaluations, - std::vector fold_evaluations, - std::vector sample_evaluations, - const std::string& data_content, - std::vector output_files, - std::shared_ptr task, std::shared_ptr flow, - const std::string& run_id, std::shared_ptr model, - std::vector tags, std::string predictions_url) - : m_uploader(uploader), m_uploader_name(uploader_name), - m_setup_id(setup_id), m_setup_string(setup_string), - m_parameter_settings(parameter_settings), - m_evaluations(std::move(evaluations)), - m_fold_evaluations(std::move(fold_evaluations)), - m_sample_evaluations(std::move(sample_evaluations)), - m_data_content(data_content), - m_output_files(std::move(output_files)), m_task(std::move(task)), - m_flow(std::move(flow)), m_run_id(run_id), - m_model(std::move(model)), m_tags(std::move(tags)), - m_predictions_url(std::move(predictions_url)) - { - } - - static std::shared_ptr - from_filesystem(const std::string& directory); - - static std::shared_ptr run_flow_on_task( - std::shared_ptr flow, std::shared_ptr task); - - static std::shared_ptr run_model_on_task( - std::shared_ptr model, std::shared_ptr task); - - void to_filesystem(const std::string& directory) const; - - void publish() const; - - private: - std::string m_uploader; - std::string m_uploader_name; - std::string m_setup_id; - std::string m_setup_string; - std::string m_parameter_settings; - std::vector m_evaluations; - std::vector m_fold_evaluations; - std::vector m_sample_evaluations; - std::string m_data_content; - std::vector m_output_files; - std::shared_ptr m_task; - 
std::shared_ptr m_flow; - std::string m_run_id; - std::shared_ptr m_model; - std::vector m_tags; - std::string m_predictions_url; - }; -} // namespace shogun - -#endif // SHOGUN_OPENMLFLOW_H diff --git a/src/shogun/io/openml/OpenMLData.cpp b/src/shogun/io/openml/OpenMLData.cpp new file mode 100644 index 00000000000..66dc3dcf482 --- /dev/null +++ b/src/shogun/io/openml/OpenMLData.cpp @@ -0,0 +1,255 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include +#include +#include + +#include +#include +#include + +#include + +using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; + +std::shared_ptr +OpenMLData::get_dataset(const std::string& id, const std::string& api_key) +{ + // description + Document document; + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("dataset_description", "json", id); + + document.Parse(return_string.c_str()); + check_response(document, "data_set_description"); + + const Value& dataset_description = document["data_set_description"]; + + auto name = return_if_possible( + "name", dataset_description.GetObject()); + auto description = return_if_possible( + "description", dataset_description.GetObject()); + auto data_format = return_if_possible( + "data_format", dataset_description.GetObject()); + auto dataset_id = + return_if_possible("id", dataset_description.GetObject()); + auto version = return_if_possible( + "version", dataset_description.GetObject()); + auto creator = return_if_possible( + "creator", dataset_description.GetObject()); + auto contributor = return_if_possible( + "contributor", dataset_description.GetObject()); + auto collection_date = return_if_possible( + "collection_date", dataset_description.GetObject()); + auto upload_date = return_if_possible( + "upload_date", dataset_description.GetObject()); + auto language = return_if_possible( + "language", dataset_description.GetObject()); + auto 
licence = return_if_possible( + "licence", dataset_description.GetObject()); + auto url = + return_if_possible("url", dataset_description.GetObject()); + auto default_target_attribute = return_if_possible( + "default_target_attribute", dataset_description.GetObject()); + auto row_id_attribute = return_if_possible( + "row_id_attribute", dataset_description.GetObject()); + auto ignore_attribute = return_if_possible( + "ignore_attribute", dataset_description.GetObject()); + auto version_label = return_if_possible( + "version_label", dataset_description.GetObject()); + auto citation = return_if_possible( + "citation", dataset_description.GetObject()); + auto tags = return_if_possible>( + "tag", dataset_description.GetObject()); + auto visibility = return_if_possible( + "visibility", dataset_description.GetObject()); + auto original_data_url = return_if_possible( + "original_data_url", dataset_description.GetObject()); + auto paper_url = return_if_possible( + "paper_url", dataset_description.GetObject()); + auto update_comment = return_if_possible( + "update_comment", dataset_description.GetObject()); + auto md5_checksum = return_if_possible( + "md5_checksum", dataset_description.GetObject()); + + // features + std::vector>> + param_vector; + return_string = reader.get("data_features", "json", id); + document.Parse(return_string.c_str()); + check_response(document, "data_features"); + const Value& dataset_features = document["data_features"]; + for (const auto& param : dataset_features["feature"].GetArray()) + { + std::unordered_map> param_map; + for (const auto& param_descriptors : param.GetObject()) + { + std::vector second; + if (param_descriptors.value.IsArray()) + for (const auto& v : param_descriptors.value.GetArray()) + second.emplace_back(v.GetString()); + else + second.emplace_back(param_descriptors.value.GetString()); + + param_map.emplace(param_descriptors.name.GetString(), second); + } + param_vector.push_back(param_map); + } + + // qualities + std::vector> 
qualities_vector; + return_string = reader.get("data_qualities", "json", id); + document.Parse(return_string.c_str()); + check_response(document, "data_qualities"); + const Value& data_qualities = document["data_qualities"]; + for (const auto& param : data_qualities["quality"].GetArray()) + { + std::unordered_map param_map; + for (const auto& param_quality : param.GetObject()) + { + if (param_quality.name.IsString() && param_quality.value.IsString()) + param_map.emplace( + param_quality.name.GetString(), + param_quality.value.GetString()); + else if (param_quality.name.IsString()) + param_map.emplace(param_quality.name.GetString(), ""); + } + qualities_vector.push_back(param_map); + } + + auto result = std::make_shared( + name, description, data_format, dataset_id, version, creator, + contributor, collection_date, upload_date, language, licence, url, + default_target_attribute, row_id_attribute, ignore_attribute, + version_label, citation, tags, visibility, original_data_url, paper_url, + update_comment, md5_checksum, param_vector, qualities_vector); + result->set_api_key(api_key); + return result; +} + +std::shared_ptr OpenMLData::get_features() noexcept +{ + if (!m_cached_features) + get_data(); + return m_cached_features; +} + +std::shared_ptr OpenMLData::get_features(const std::string& label) +{ + if (!m_cached_features) + get_data(); + auto find_label = + std::find(m_feature_names.begin(), m_feature_names.end(), label); + if (find_label == m_feature_names.end()) + SG_SERROR("Requested label \"%s\" not in the dataset!\n", label.c_str()) + if (!m_cached_features) + get_data(); + auto col_idx = std::distance(m_feature_names.begin(), find_label); + auto feat_type_copy = m_feature_types; + feat_type_copy.erase(feat_type_copy.begin() + col_idx); + for (const auto type : feat_type_copy) + { + if (type == ARFFDeserializer::Attribute::STRING) + SG_SERROR("Currently cannot process string features!\n") + } + std::shared_ptr result; + bool first = true; + for (int i = 0; 
i < m_feature_types.size(); ++i) + { + if (i != col_idx && first) + { + result.reset(m_cached_features->get_feature_obj(i)); + first = false; + } + if (i != col_idx) + result.reset(result->create_merged_copy( + m_cached_features->get_feature_obj(i))); + } + std::dynamic_pointer_cast>(result)->set_num_features(m_feature_types.size()); + std::dynamic_pointer_cast>(result)->set_num_vectors(m_cached_features->get_num_vectors()); + + return result; +} + +std::shared_ptr OpenMLData::get_labels() +{ + if (!m_cached_features) + get_data(); + REQUIRE( + !m_default_target_attribute.empty(), + "A default target attribute is required if no label is given!\n") + return get_labels(m_default_target_attribute); +} + +std::shared_ptr OpenMLData::get_labels(const std::string& label_name) +{ + if (!m_cached_features) + get_data(); + auto find_label = + std::find(m_feature_names.begin(), m_feature_names.end(), label_name); + if (find_label == m_feature_names.end()) + SG_SERROR( + "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) + auto col_idx = std::distance(m_feature_names.begin(), find_label); + + if (!m_cached_features) + get_data(); + + auto target_label_as_feat = + std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); + + switch (m_feature_types[col_idx]) + { + // real features + case ARFFDeserializer::Attribute::REAL: + case ARFFDeserializer::Attribute::NUMERIC: + case ARFFDeserializer::Attribute::INTEGER: + case ARFFDeserializer::Attribute::DATE: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + auto labels = std::make_shared(labels_vec); + return labels; + } + break; + // nominal features + case ARFFDeserializer::Attribute::NOMINAL: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + for(auto& val: labels_vec) + { + if (val == 0) + val = -1; + } + 
auto labels = std::make_shared(labels_vec); + return labels; + } + break; + default: + SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) + } + + return nullptr; +} + +void OpenMLData::get_data() +{ + auto reader = OpenMLReader(m_api_key); + std::shared_ptr ss = + std::make_shared(reader.get(m_url)); + + auto parser = ARFFDeserializer(ss); + parser.read(); + m_cached_features = parser.get_features(); + m_feature_names = parser.get_feature_names(); + m_feature_types = parser.get_attribute_types(); +} diff --git a/src/shogun/io/openml/OpenMLData.h b/src/shogun/io/openml/OpenMLData.h new file mode 100644 index 00000000000..7e21474775b --- /dev/null +++ b/src/shogun/io/openml/OpenMLData.h @@ -0,0 +1,162 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLDATA_H +#define SHOGUN_OPENMLDATA_H + +#include +#include + +#include +#include +#include + +namespace shogun +{ + /** + * Handles an OpenML dataset. 
+ */ + class OpenMLData + { + public: + OpenMLData( + const std::string& name, const std::string& description, + const std::string& data_format, const std::string& dataset_id, + const std::string& version, const std::string& creator, + const std::string& contributor, const std::string& collection_date, + const std::string& upload_date, const std::string& language, + const std::string& license, const std::string& url, + const std::string& default_target_attribute, + const std::string& row_id_attribute, + const std::string& ignore_attribute, + const std::string& version_label, const std::string& citation, + std::vector tag, const std::string& visibility, + const std::string& original_data_url, const std::string& paper_url, + const std::string& update_comment, const std::string& md5_checksum, + std::vector< + std::unordered_map> + + > + param_descriptors, + std::vector> + param_qualities) + : + + m_name(name), m_description(description), + m_data_format(data_format), m_dataset_id(dataset_id), + m_version(version), m_creator(creator), + m_contributor(contributor), m_collection_date(collection_date), + m_upload_date(upload_date), m_language(language), + m_license(license), m_url(url), + m_default_target_attribute(default_target_attribute), + m_row_id_attribute(row_id_attribute), + m_ignore_attribute(ignore_attribute), + m_version_label(version_label), m_citation(citation), + m_tag(std::move(tag)), m_visibility(visibility), + m_original_data_url(original_data_url), m_paper_url(paper_url), + m_update_comment(update_comment), m_md5_checksum(md5_checksum), + m_param_descriptors(std::move(param_descriptors)), + m_param_qualities(std::move(param_qualities)) + { + } + + /** + * Creates a dataset instance from a given ID. 
+ * + */ + static std::shared_ptr + get_dataset(const std::string& id, const std::string& api_key); + + /** + * Returns ALL the features of the dataset, potentially also the labels + * column + * @return the features + */ + std::shared_ptr get_features() noexcept; + + /** + * Returns the dataset features + * @param label_name the name of the attribute containing the label + * @return the features + */ + std::shared_ptr get_features(const std::string& label_name); + + /** + * Returns the dataset labels if m_default_target_attribute is not empty + * @return the labels + */ + std::shared_ptr get_labels(); + + /** + * Returns the dataset labels given the label_name + * @return the labels + */ + std::shared_ptr get_labels(const std::string& label_name); + + /** + * Returns the type of all attributes/features in the ARFF file + * @return + */ + SG_FORCED_INLINE std::vector + + get_feature_types() const noexcept + { + return m_feature_types; + } + + SG_FORCED_INLINE std::string + + get_default_target_attribute() const noexcept + { + return m_default_target_attribute; + } + + protected: + SG_FORCED_INLINE void set_api_key(const std::string& api_key) noexcept + { + m_api_key = api_key; + } + + private: + void get_data(); + + std::string m_name; + std::string m_description; + std::string m_data_format; + std::string m_dataset_id; + std::string m_version; + std::string m_creator; + std::string m_contributor; + std::string m_collection_date; + std::string m_upload_date; + std::string m_language; + std::string m_license; + std::string m_url; + std::string m_default_target_attribute; + std::string m_row_id_attribute; + std::string m_ignore_attribute; + std::string m_version_label; + std::string m_citation; + std::vector m_tag; + std::string m_visibility; + std::string m_original_data_url; + std::string m_paper_url; + std::string m_update_comment; + std::string m_md5_checksum; + std::vector>> + m_param_descriptors; + std::vector> + m_param_qualities; + std::string m_api_key; + + 
std::shared_ptr m_cached_features; + std::vector m_feature_names; + std::vector m_feature_types; + }; + +} // namespace shogun + +#endif // SHOGUN_OPENMLDATA_H diff --git a/src/shogun/io/openml/OpenMLFlow.cpp b/src/shogun/io/openml/OpenMLFlow.cpp new file mode 100644 index 00000000000..dd2f3f839a6 --- /dev/null +++ b/src/shogun/io/openml/OpenMLFlow.cpp @@ -0,0 +1,115 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include + +#include +#include + +#include + +using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; + + +std::shared_ptr OpenMLFlow::download_flow( + const std::string& flow_id, const std::string& api_key) +{ + Document document; + parameters_type params; + components_type components; + std::string name; + std::string description; + std::string class_name; + + // get flow and parse with RapidJSON + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("flow_file", "json", flow_id); + document.Parse(return_string.c_str()); + check_response(document, "flow"); + + // store root for convenience. We know it exists from previous check. 
+ const Value& root = document["flow"]; + + // handle parameters + if (root.HasMember("parameter")) + { + std::unordered_map param_dict; + + if (root["parameter"].IsArray()) + { + for (const auto& v : root["parameter"].GetArray()) + { + emplace_string_to_map(v, param_dict, "data_type"); + emplace_string_to_map(v, param_dict, "default_value"); + emplace_string_to_map(v, param_dict, "description"); + params.emplace(v["name"].GetString(), param_dict); + param_dict.clear(); + } + } + else + { + // parameter can also be a dict, instead of array + const auto v = root["parameter"].GetObject(); + emplace_string_to_map(v, param_dict, "data_type"); + emplace_string_to_map(v, param_dict, "default_value"); + emplace_string_to_map(v, param_dict, "description"); + params.emplace(v["name"].GetString(), param_dict); + } + } + + // handle components, i.e. kernels + if (root.HasMember("component")) + { + if (root["component"].IsArray()) + { + for (const auto& v : root["component"].GetArray()) + { + components.emplace( + v["identifier"].GetString(), + OpenMLFlow::download_flow( + v["flow"]["id"].GetString(), api_key)); + } + } + else + { + components.emplace( + root["component"]["identifier"].GetString(), + OpenMLFlow::download_flow( + root["component"]["flow"]["id"].GetString(), api_key)); + } + } + + // get remaining information from flow + if (root.HasMember("name")) + name = root["name"].GetString(); + if (root.HasMember("description")) + description = root["description"].GetString(); + if (root.HasMember("class_name")) + class_name = root["class_name"].GetString(); + + auto flow = std::make_shared( + name, description, class_name, components, params); + + return flow; +} + +void OpenMLFlow::upload_flow(const std::shared_ptr& flow) +{ + SG_SNOTIMPLEMENTED; +} + +void OpenMLFlow::dump() const +{ + SG_SNOTIMPLEMENTED; +} + +std::shared_ptr OpenMLFlow::from_file() +{ + SG_SNOTIMPLEMENTED; + return std::shared_ptr(); +} \ No newline at end of file diff --git 
a/src/shogun/io/openml/OpenMLFlow.h b/src/shogun/io/openml/OpenMLFlow.h new file mode 100644 index 00000000000..e8424e58343 --- /dev/null +++ b/src/shogun/io/openml/OpenMLFlow.h @@ -0,0 +1,142 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLFLOW_H +#define SHOGUN_OPENMLFLOW_H + +#include + +#include +#include + + +namespace shogun +{ + /** + * Handles OpenML flows. A flow contains the information + * required to instantiate a model. + */ + class OpenMLFlow + { + + public: + /** alias for component type, map of flows */ + using components_type = + std::unordered_map>; + /** alias for parameter type, map of maps with information specific to a + * parameter */ + using parameters_type = std::unordered_map< + std::string, std::unordered_map>; + + /** + * The OpenMLFlow constructor. This constructor is rarely called directly by + * the user; it is used by the static class members download_flow and + * from_file. The user is expected to use either of the previously + * mentioned functions. + * + * @param name the model name + * @param description the model description + * @param model the flow class_name field + * @param components a map of subflows, i.e. kernels + * @param parameters a map of parameter information, i.e. default values + * for each parameter name + */ + OpenMLFlow( + const std::string& name, const std::string& description, + const std::string& model, components_type components, + parameters_type parameters) + : m_name(name), m_description(description), m_class_name(model), + m_parameters(std::move(parameters)), + m_components(std::move(components)) + { + } + + /** + * Instantiates an OpenMLFlow by downloading a flow from the OpenML + * server.
+ * + * @param flow_id the flow ID + * @param api_key the user API key (might not be required and can be an + * empty string) + * @return the OpenMLFlow corresponding to the flow requested + * @throws ShogunException when there is a server error or the requested + * flow is ill formed. + */ + static std::shared_ptr + download_flow(const std::string& flow_id, const std::string& api_key); + + /** + * Instantiates an OpenMLFlow from a file. + * @return the OpenMLFlow corresponding to the flow requested + */ + static std::shared_ptr from_file(); + + /** + * Publishes a flow to the OpenML server + * @param flow the flow to be published + */ + static void upload_flow(const std::shared_ptr& flow); + + /** + * Dumps the OpenMLFlow to disk. + */ + void dump() const; + + /** + * Gets a subflow, i.e. a kernel in a machine + * @param name the name of the subflow, not the flow ID + * @return the subflow if it exists; otherwise an error is raised with SG_SERROR + */ + std::shared_ptr get_subflow(const std::string& name) + { + auto find_flow = m_components.find(name); + if (find_flow != m_components.end()) + return find_flow->second; + else + SG_SERROR( + "The provided subflow could not be found in this flow!") + return nullptr; + } + +#ifndef SWIG + SG_FORCED_INLINE parameters_type + + get_parameters() const noexcept + { + return m_parameters; + } + + SG_FORCED_INLINE components_type + + get_components() const noexcept + { + return m_components; + } + + SG_FORCED_INLINE std::string + + get_class_name() const noexcept + { + return m_class_name; + } + +#endif // SWIG + + private: + /** name field of the flow */ + std::string m_name; + /** description field of the flow */ + std::string m_description; + /** the class_name field of the flow */ + std::string m_class_name; + /** the parameter field of the flow (optional) */ + parameters_type m_parameters; + /** the components fields of the flow (optional) */ + components_type m_components; + }; +} // namespace shogun + +#endif // SHOGUN_OPENMLFLOW_H diff --git
a/src/shogun/io/openml/OpenMLReader.cpp b/src/shogun/io/openml/OpenMLReader.cpp new file mode 100644 index 00000000000..bd9d1b7de34 --- /dev/null +++ b/src/shogun/io/openml/OpenMLReader.cpp @@ -0,0 +1,97 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include + +#ifdef HAVE_CURL +#include +#endif // HAVE_CURL + +using namespace shogun; + +/** + * The writer callback function used to write the packets to a C++ string. + * @param data the data received in CURL request + * @param size always 1 + * @param nmemb the size of data + * @param buffer_in the buffer to write to + * @return the number of bytes appended (size * nmemb), or 0 if buffer_in is null + */ +size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) +{ + // check that the buffer string points to something + if (buffer_in != nullptr) + { + // Append the data to the buffer + buffer_in->append(data, size * nmemb); + + return size * nmemb; + } + return 0; +} + +/* OpenML server format */ +const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; +const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLReader::download_server = ""; +const char* OpenMLReader::splits_server = "https://www.openml.org/api_splits"; + +/* DATA API */ +const char* OpenMLReader::dataset_description = "/data/{}"; +const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; +const char* OpenMLReader::data_features = "/data/features/{}"; +const char* OpenMLReader::data_qualities = "/data/qualities/{}"; +const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; +const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; +/* FLOW API */ +const char* OpenMLReader::flow_file = "/flow/{}"; +/* TASK API */ +const char* OpenMLReader::task_file = "/task/{}"; +/* SPLIT API */ +const char* OpenMLReader::get_split = "/get/{}"; + +const std::unordered_map + OpenMLReader::m_format_options = {{"xml",
xml_server}, + {"json", json_server}, + {"split", splits_server}, + {"download", download_server}}; +const std::unordered_map + OpenMLReader::m_request_options = { + {"dataset_description", dataset_description}, + {"list_data_qualities", list_data_qualities}, + {"data_features", data_features}, + {"data_qualities", data_qualities}, + {"list_dataset_qualities", list_dataset_qualities}, + {"list_dataset_filter", list_dataset_filter}, + {"flow_file", flow_file}, + {"task_file", task_file}}; + +void OpenMLReader::openml_curl_request_helper(const std::string& url) +{ +#ifdef HAVE_CURL + CURL* curl_handle = nullptr; + + curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.\n") + return; + } + + curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl_handle, CURLOPT_HTTPGET, 1L); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); + + CURLcode res = curl_easy_perform(curl_handle); + // release the handle before reporting: SG_SERROR raises, so a cleanup placed after it would be skipped + curl_easy_cleanup(curl_handle); + + if (res != CURLE_OK) + SG_SERROR("Connection error: %s.\n", curl_easy_strerror(res)) +#endif // HAVE_CURL +} diff --git a/src/shogun/io/openml/OpenMLReader.h b/src/shogun/io/openml/OpenMLReader.h new file mode 100644 index 00000000000..8078b1e78ec --- /dev/null +++ b/src/shogun/io/openml/OpenMLReader.h @@ -0,0 +1,183 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLREADER_H +#define SHOGUN_OPENMLREADER_H + +#include +#include + +#include +#include +#include +#include + +namespace shogun +{ + /** + * Reads OpenML streams which can be downloaded with this class. + */ + class OpenMLReader + { + + public: + explicit OpenMLReader(const std::string& api_key) : m_api_key(api_key) + { + } + + /** + * Returns a string returned by the server given a request. + * Raises an error if the format or request is unknown or the transfer fails.
+ * Additional arguments can be passed to the request, + * which are then concatenated with a "/" character. + * + * @tparam Args argument type pack, should all be std::string + * @param request the request name, see m_request_options + * @param format the format to return the data in, see m_format_options + * @param args the additional arguments to be passed to request + * @return the response body the server returned for the assembled url + */ + template + std::string + get(const std::string& request, const std::string& format, Args... args) + { + std::string request_path; + auto find_format = m_format_options.find(format); + if (find_format == m_format_options.end()) + { + SG_SERROR( + "The provided format \"%s\" is not available\n", + format.c_str()) + } + + if (format == "split") + { + REQUIRE( + request == "get_split", + "Split server can only handle \"get_split\" request.\n") + request_path = get_split; + } + else + { + auto find_request = m_request_options.find(request); + if (find_request == m_request_options.end()) + { + SG_SERROR( + "Could not find a way to solve the request \"%s\"\n", + request.c_str()) + } + request_path = find_request->second; + } + + std::string request_format = find_format->second; + + // get additional args and concatenate them with "/"; they replace the trailing "{}" placeholder + if (sizeof...(Args) > 0) + { + if (request_path.substr(request_path.size() - 2) == "{}") + { + request_path = + request_path.substr(0, request_path.size() - 2); + } + else + { + SG_SERROR( + "The provided request \"%s\" cannot handle additional " + "args.\n", + request.c_str()) + } + std::vector args_vec = {args...}; + std::string args_string = std::accumulate( + args_vec.begin() + 1, args_vec.end(), args_vec.front(), + [](std::string s0, std::string& s1) { + return s0 += "/" + s1; + }); + request_path += args_string; + } + + std::string url = request_format + request_path + "?"
+ m_api_key; + + return get(url); + } + + std::string get(const std::string& url) + { +#ifdef HAVE_CURL + // clear the buffer before request + m_curl_response_buffer.clear(); + + openml_curl_request_helper(url); + return m_curl_response_buffer; +#else + SG_SERROR( + "This function is only available with the CURL library!\n") +#endif // HAVE_CURL + } + + private: + /** the raw buffer as a C++ string */ + std::string m_curl_response_buffer; + + /** + * Initialises CURL session and gets the data. + * Raises an error if the transfer fails; the CURL handle is released either way. + * + * @param url the url to query + */ + void openml_curl_request_helper(const std::string& url); + + /** the user API key, not required for all requests */ + std::string m_api_key; + + /** the server path to get a response in XML format*/ + static const char* xml_server; + /** the server path to get a response in JSON format*/ + static const char* json_server; + /** the server path to download datasets */ + static const char* download_server; + /** the server path to get a split in ARFF format */ + static const char* splits_server; + + /** the servers selectable through the format argument: xml, json, split and download */ + static const std::unordered_map + m_format_options; + /** all the supported server options */ + static const std::unordered_map + m_request_options; + + /* DATA API */ + static const char* dataset_description; + static const char* list_data_qualities; + static const char* data_features; + static const char* data_qualities; + static const char* list_dataset_qualities; + static const char* list_dataset_filter; + + /* FLOW API */ + static const char* flow_file; + + /* TASK API */ + static const char* task_file; + + /* SPLIT API */ + static const char* get_split; + }; + + /** + * Writes OpenML streams to the OpenML server.
+ */ + class OpenMLWritter + { + public: + OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; + + private: + /** the user API key, likely to be needed to write to OpenML */ + std::string m_api_key; + }; +} // namespace shogun + +#endif // SHOGUN_OPENMLREADER_H diff --git a/src/shogun/io/openml/OpenMLRun.cpp b/src/shogun/io/openml/OpenMLRun.cpp new file mode 100644 index 00000000000..796545e10de --- /dev/null +++ b/src/shogun/io/openml/OpenMLRun.cpp @@ -0,0 +1,68 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include +#include +#include +#include + +using namespace shogun; + +std::shared_ptr OpenMLRun::run_model_on_task( + std::shared_ptr model, std::shared_ptr task) +{ + SG_SNOTIMPLEMENTED + return std::shared_ptr(); +} + +std::shared_ptr OpenMLRun::run_flow_on_task( + std::shared_ptr flow, std::shared_ptr task) +{ + auto data = task->get_dataset(); + std::shared_ptr train_features = nullptr, test_features = nullptr; + std::shared_ptr train_labels = nullptr, test_labels = nullptr; + + if (task->get_split()->contains_splits()) + SG_SNOTIMPLEMENTED + else + { + train_labels = data->get_labels(); + train_features = + data->get_features(data->get_default_target_attribute()); + // ensures delete is called by shared ptr destructor + SG_REF(train_labels.get()) + SG_REF(train_features.get()) + auto model = ShogunOpenML::flow_to_model(std::move(flow), true); + + if (auto machine = std::dynamic_pointer_cast(model)) + { + auto result = ShogunOpenML::run_model_on_fold( + machine, task, train_features, 0, 0, train_labels, + test_features); + SG_SDEBUG("%s\n", result->to_string().c_str()); // runtime text must be an argument, never the format string + } + else + SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") + } + return std::shared_ptr(); +} + +std::shared_ptr +OpenMLRun::from_filesystem(const std::string& directory) +{ + SG_SNOTIMPLEMENTED + return nullptr; +} + +void OpenMLRun::to_filesystem(const std::string& directory) const +{ +
SG_SNOTIMPLEMENTED +} + +void OpenMLRun::publish() const +{ + SG_SNOTIMPLEMENTED +} diff --git a/src/shogun/io/openml/OpenMLRun.h b/src/shogun/io/openml/OpenMLRun.h new file mode 100644 index 00000000000..9046d09bea9 --- /dev/null +++ b/src/shogun/io/openml/OpenMLRun.h @@ -0,0 +1,78 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLRUN_H +#define SHOGUN_OPENMLRUN_H + +#include + +#include +#include + +namespace shogun { + class OpenMLRun + { + public: + OpenMLRun( + const std::string& uploader, const std::string& uploader_name, + const std::string& setup_id, const std::string& setup_string, + const std::string& parameter_settings, + std::vector evaluations, + std::vector fold_evaluations, + std::vector sample_evaluations, + const std::string& data_content, + std::vector output_files, + std::shared_ptr task, std::shared_ptr flow, + const std::string& run_id, std::shared_ptr model, + std::vector tags, std::string predictions_url) + : m_uploader(uploader), m_uploader_name(uploader_name), + m_setup_id(setup_id), m_setup_string(setup_string), + m_parameter_settings(parameter_settings), + m_evaluations(std::move(evaluations)), + m_fold_evaluations(std::move(fold_evaluations)), + m_sample_evaluations(std::move(sample_evaluations)), + m_data_content(data_content), + m_output_files(std::move(output_files)), m_task(std::move(task)), + m_flow(std::move(flow)), m_run_id(run_id), + m_model(std::move(model)), m_tags(std::move(tags)), + m_predictions_url(std::move(predictions_url)) + { + } + + static std::shared_ptr + from_filesystem(const std::string& directory); + + static std::shared_ptr run_flow_on_task( + std::shared_ptr flow, std::shared_ptr task); + + static std::shared_ptr run_model_on_task( + std::shared_ptr model, std::shared_ptr task); + + void to_filesystem(const std::string& directory) const; + + void publish() const; + + private: + std::string m_uploader; + std::string
m_uploader_name; + std::string m_setup_id; + std::string m_setup_string; + std::string m_parameter_settings; + std::vector m_evaluations; + std::vector m_fold_evaluations; + std::vector m_sample_evaluations; + std::string m_data_content; + std::vector m_output_files; + std::shared_ptr m_task; + std::shared_ptr m_flow; + std::string m_run_id; + std::shared_ptr m_model; + std::vector m_tags; + std::string m_predictions_url; + }; +} + +#endif //SHOGUN_OPENMLRUN_H diff --git a/src/shogun/io/openml/OpenMLSplit.cpp b/src/shogun/io/openml/OpenMLSplit.cpp new file mode 100644 index 00000000000..c4481a860cd --- /dev/null +++ b/src/shogun/io/openml/OpenMLSplit.cpp @@ -0,0 +1,112 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include +#include +#include + +#include +#include + +using namespace shogun; + +std::shared_ptr +OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) +{ + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("get_split", "split", split_url); + + if (return_string == "Task not providing datasplits.") + return std::make_shared(); + + auto return_stream = std::istringstream(return_string); + // TODO: add ARFF parsing here + // get train/test indices + // TODO: replace line below with ARFFDeserialiser::get_features() + auto arff_features = std::make_shared(); + REQUIRE( + arff_features->get_num_feature_obj() == 4, + "Expected a ARFF file with 4 attributes: type, rowid, repeat and " + "fold.\n") + + auto train_test_feat = + std::shared_ptr(arff_features->get_feature_obj(0)); + auto rowid_feat = + std::shared_ptr(arff_features->get_feature_obj(1)); + auto repeat_feat = + std::shared_ptr(arff_features->get_feature_obj(2)); + auto fold_feat = + std::shared_ptr(arff_features->get_feature_obj(3)); + + auto type_vector = string_feature_to_vector(train_test_feat); + auto rowid_vector = dense_feature_to_vector(rowid_feat); + auto repeat_vector =
dense_feature_to_vector(repeat_feat); + auto fold_vector = dense_feature_to_vector(fold_feat); + + std::vector> train_idx, test_idx; + for (int i = 0; i < arff_features->get_num_vectors(); ++i) + { + if (type_vector[i] == LabelType::TRAIN) + train_idx.emplace_back(std::initializer_list{ + static_cast(rowid_vector[i]), + static_cast(repeat_vector[i]), + static_cast(fold_vector[i])}); + else + test_idx.emplace_back(std::initializer_list{ + static_cast(rowid_vector[i]), + static_cast(repeat_vector[i]), + static_cast(fold_vector[i])}); + } + + return std::make_shared(train_idx, test_idx); +} + +SGVector +OpenMLSplit::dense_feature_to_vector(const std::shared_ptr& feat) +{ + auto casted_feat = + std::dynamic_pointer_cast>(feat); + // this should never happen + if (!casted_feat) + SG_SERROR("Error casting a column in the split file from CFeatures to " + "CDenseFeatures!\n"); + return casted_feat->get_feature_vector(0); +} + +std::vector +OpenMLSplit::string_feature_to_vector(const std::shared_ptr& feat) +{ + auto casted_feat = std::dynamic_pointer_cast>(feat); + // this should never happen + if (!casted_feat) + SG_SERROR("Error casting a column in the split file from CFeatures to " + "CStringFeatures!\n"); + + auto to_lower = [](const std::string& line) { + std::string result; + std::transform( + line.begin(), line.end(), std::back_inserter(result), + [](uint8_t val) { return std::tolower(val); }); + return result; + }; + + std::vector result; + + for (int i = 0; i < casted_feat->get_num_vectors(); ++i) + { + auto row = casted_feat->get_feature_vector(i); + std::string label(1, row[0]); + for (auto j = 1; j < row.vlen; ++j) // bound by this row's own length: "test" is shorter than "train", so the global maximum overruns it + label.append(1, row[j]); + if (to_lower(label) == "train") + result.push_back(LabelType::TRAIN); + else if (to_lower(label) == "test") + result.push_back(LabelType::TEST); + else + SG_SERROR("Unknown label type in split file %s!\n", label.c_str()) + } + return result; +} \ No newline at end of file diff --git
a/src/shogun/io/openml/OpenMLSplit.h b/src/shogun/io/openml/OpenMLSplit.h new file mode 100644 index 00000000000..ab61aebdd77 --- /dev/null +++ b/src/shogun/io/openml/OpenMLSplit.h @@ -0,0 +1,71 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLSPLIT_H +#define SHOGUN_OPENMLSPLIT_H + +#include +#include + +namespace shogun +{ + /** + * Handles an OpenML split: the train and test index lists of a task, where get_split stores a row id, repeat and fold per entry. + */ + class OpenMLSplit + { + public: + enum class LabelType + { + TRAIN = 1, + TEST = 2 + }; + + /** + * Default constructor. This is used when there are no + * train or test indices; contains_splits() then returns false. + */ + OpenMLSplit() = default; + + OpenMLSplit( + std::vector> train_idx, + std::vector> test_idx) + : m_train_idx(std::move(train_idx)), m_test_idx(std::move(test_idx)) + { + } + + static std::shared_ptr + get_split(const std::string& split_url, const std::string& api_key); + + SG_FORCED_INLINE std::vector> get_train_idx() const + noexcept + { + return m_train_idx; + } + + SG_FORCED_INLINE std::vector> get_test_idx() const + noexcept + { + return m_test_idx; + } + + SG_FORCED_INLINE bool contains_splits() const noexcept + { + return !m_train_idx.empty() && !m_test_idx.empty(); + } + + private: + static SGVector + dense_feature_to_vector(const std::shared_ptr& feat); + + static std::vector + string_feature_to_vector(const std::shared_ptr& feat); + + std::vector> m_train_idx; + std::vector> m_test_idx; + }; +} // namespace shogun +#endif // SHOGUN_OPENMLSPLIT_H diff --git a/src/shogun/io/openml/OpenMLTask.cpp b/src/shogun/io/openml/OpenMLTask.cpp new file mode 100644 index 00000000000..493664a8367 --- /dev/null +++ b/src/shogun/io/openml/OpenMLTask.cpp @@ -0,0 +1,137 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file).
+ * + * Authors: Gil Hoben + */ + +#include +#include +#include + +using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; + +std::shared_ptr +OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) +{ + Document document; + std::string task_name; + std::string task_type_id; + std::shared_ptr openml_dataset = nullptr; + std::shared_ptr openml_split = nullptr; + std::unordered_map evaluation_measures; + + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("task_file", "json", task_id); + + document.Parse(return_string.c_str()); + check_response(document, "task"); + + const Value& root = document["task"]; + + REQUIRE( + task_id == root["task_id"].GetString(), + "Expected downloaded task to have the same id as the requested task " + "id, but got \"%s\", instead of \"%s\".\n", + root["task_id"].GetString(), task_id.c_str()) + + task_name = root["task_name"].GetString(); + OpenMLTask::TaskType task_type = + get_task_from_string(root["task_type"].GetString()); + task_type_id = root["task_type_id"].GetString(); + + // the input array must carry the dataset (source_data) and the split (estimation_procedure); evaluation_measures is read when present + const Value& json_input = root["input"]; + + auto input_array = json_input.GetArray(); + + for (const auto& task_settings : input_array) + { + if (strcmp(task_settings["name"].GetString(), "source_data") == 0) + { + auto dataset_info = task_settings["data_set"].GetObject(); + std::string dataset_id = dataset_info["data_set_id"].GetString(); + std::string target_feature = + dataset_info["target_feature"].GetString(); + openml_dataset = OpenMLData::get_dataset(dataset_id, api_key); + } + else if ( + strcmp(task_settings["name"].GetString(), "estimation_procedure") == + 0) + { + auto split_info = task_settings["estimation_procedure"].GetObject(); + std::string split_id = split_info["id"].GetString(); + std::string split_type = split_info["type"].GetString(); + std::string split_url = split_info["data_splits_url"].GetString(); +
std::unordered_map split_parameters; + for (const auto& param : split_info["parameter"].GetArray()) + { + if (param.HasMember("name") && param.HasMember("value")) + split_parameters.emplace( + param["name"].GetString(), param["value"].GetString()); + else if (param.HasMember("name")) + split_parameters.emplace(param["name"].GetString(), ""); + else + SG_SERROR( + "Unexpected number of parameters in parameter array " + "of estimation_procedure.\n") + } + REQUIRE( + split_type == "crossvalidation", + "Currently only tasks with cross validation are enabled in " + "shogun!\n") + openml_split = OpenMLSplit::get_split(split_url, api_key); + } + else if ( + strcmp(task_settings["name"].GetString(), "evaluation_measures") == + 0) + { + auto evaluation_info = + task_settings["evaluation_measures"].GetObject(); + for (const auto& param : evaluation_info) + { + if (param.value.IsString()) + evaluation_measures.emplace( + param.name.GetString(), param.value.GetString()); + else + evaluation_measures.emplace(param.name.GetString(), ""); + } + } + } + + if (openml_dataset == nullptr || openml_split == nullptr) // a usable task needs both its dataset and its split + SG_SERROR("Error parsing task.\n") + + auto result = std::make_shared( + task_id, task_name, task_type, task_type_id, evaluation_measures, + openml_split, openml_dataset); + + return result; +} + +OpenMLTask::TaskType +OpenMLTask::get_task_from_string(const std::string& task_type) +{ + if (task_type == "Supervised Classification") + return OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION; + SG_SERROR("OpenMLTask does not support \"%s\"\n", task_type.c_str()) +} + +std::vector> OpenMLTask::get_train_indices() const +{ + return get_indices(m_split->get_train_idx()); +} + +std::vector> OpenMLTask::get_test_indices() const +{ + return get_indices(m_split->get_test_idx()); +} + +std::vector> +OpenMLTask::get_indices(const std::vector>& idx) const +{ + SG_SNOTIMPLEMENTED + std::vector> result; + return result; +} \ No newline at end of file diff --git
a/src/shogun/io/openml/OpenMLTask.h b/src/shogun/io/openml/OpenMLTask.h new file mode 100644 index 00000000000..a849ea37de9 --- /dev/null +++ b/src/shogun/io/openml/OpenMLTask.h @@ -0,0 +1,95 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLTASK_H +#define SHOGUN_OPENMLTASK_H + +#include +#include + +namespace shogun +{ + /** + * Handles OpenML tasks. A task contains all the information + * required to train and test a model. + */ + class OpenMLTask + { + public: + enum class TaskType + { + SUPERVISED_CLASSIFICATION = 0, + SUPERVISED_REGRESSION = 1, + LEARNING_CURVE = 2, + SUPERVISED_DATASTREAM_CLASSIFICATION = 3, + CLUSTERING = 4, + MACHINE_LEARNING_CHALLENGE = 5, + SURVIVAL_ANALYSIS = 6, + SUBGROUP_DISCOVERY = 7 + }; + + enum class TaskEvaluation + { + + }; + + OpenMLTask( + const std::string& task_id, const std::string& task_name, + TaskType task_type, const std::string& task_type_id, + std::unordered_map evaluation_measures, + std::shared_ptr split, + std::shared_ptr data) + : m_task_id(task_id), m_task_name(task_name), + m_task_type(task_type), m_task_type_id(task_type_id), + m_evaluation_measures(std::move(evaluation_measures)), + m_split(std::move(split)), m_data(std::move(data)) + { + } + + static std::shared_ptr + get_task(const std::string& task_id, const std::string& api_key); + + std::shared_ptr get_dataset() const noexcept + { + return m_data; + } + + std::shared_ptr get_split() const noexcept + { + return m_split; + } + + std::vector> get_train_indices() const; + + std::vector> get_test_indices() const; + +#ifndef SWIG + SG_FORCED_INLINE TaskType + + get_task_type() const noexcept + { + return m_task_type; + } + +#endif // SWIG + + private: + static TaskType get_task_from_string(const std::string& task_type); + + std::vector> + get_indices(const std::vector>& idx) const; + + std::string m_task_id; + std::string m_task_name; + TaskType m_task_type; +
std::string m_task_type_id; + std::unordered_map m_evaluation_measures; + std::shared_ptr m_split; + std::shared_ptr m_data; + }; +} // namespace shogun + +#endif // SHOGUN_OPENMLTASK_H diff --git a/src/shogun/io/openml/ShogunOpenML.cpp b/src/shogun/io/openml/ShogunOpenML.cpp new file mode 100644 index 00000000000..80dbc03810f --- /dev/null +++ b/src/shogun/io/openml/ShogunOpenML.cpp @@ -0,0 +1,340 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include + +#include + +using namespace shogun; + +/** + * Class using the Any visitor pattern to convert + * a string to a C++ type that can be used as a parameter + * in a Shogun model. If the string value is not "null" it will + * be put in its casted type in the given model with the provided parameter + * name. If the value is null nothing happens, i.e. no error is thrown + * and no value is put in model. + */ +class StringToShogun : public AnyVisitor +{ +public: + explicit StringToShogun(std::shared_ptr model) + : m_model(model), m_parameter(""), m_string_val(""){}; + + StringToShogun( + std::shared_ptr model, const std::string& parameter, + const std::string& string_val) + : m_model(model), m_parameter(parameter), m_string_val(string_val){}; + + void on(bool* v) final + { + SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + bool result = strcmp(m_string_val.c_str(), "true") == 0; + m_model->put(m_parameter, result); + } + } + void on(int32_t* v) final + { + SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + try + { + int32_t result = std::stoi(m_string_val); + m_model->put(m_parameter, result); + } + catch (const std::invalid_argument&) + { + // it's an option, i.e.
internally represented + // as an enum but in swig exposed as a string + m_string_val.erase( + std::remove_if( + m_string_val.begin(), m_string_val.end(), + // remove quotes + [](const auto& val) { return val == '\"'; }), + m_string_val.end()); + m_model->put(m_parameter, m_string_val); + } + } + } + void on(int64_t* v) final + { + SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + + int64_t result = std::stoll(m_string_val); // stoll, not stol: long is only 32 bits wide on LLP64 platforms + m_model->put(m_parameter, result); + } + } + void on(float* v) final + { + SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + float32_t result = std::stof(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(double* v) final + { + SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + float64_t result = std::stod(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(long double* v) + { + SG_SDEBUG( + "long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + floatmax_t result = std::stold(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(CSGObject** v) final + { + SG_SDEBUG( + "CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) + } + void on(SGMatrix* mat) final{SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())} +
+ /** + * In OpenML "null" is an empty parameter value field. + * @return whether the field is "null" + */ + SG_FORCED_INLINE bool is_null() const noexcept + { + bool result = strcmp(m_string_val.c_str(), "null") == 0; + return result; + } + + SG_FORCED_INLINE void set_parameter_name(const std::string& name) noexcept + { + m_parameter = name; + } + + SG_FORCED_INLINE void set_string_value(const std::string& value) noexcept + { + m_string_val = value; + } + +private: + std::shared_ptr m_model; + std::string m_parameter; + std::string m_string_val; +}; + +/** + * Instantiates a CSGObject using a factory + * @param factory_name the name of the factory: "machine", "kernel" or "distance" + * @param algo_name the name of algorithm passed to factory + * @return the instantiated object using a factory + */ +std::shared_ptr instantiate_model_from_factory( + const std::string& factory_name, const std::string& algo_name) +{ + if (factory_name == "machine") + return std::shared_ptr(machine(algo_name)); + if (factory_name == "kernel") + return std::shared_ptr(kernel(algo_name)); + if (factory_name == "distance") + return std::shared_ptr(distance(algo_name)); + + SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) + + return nullptr; +} + +/** + * Downcasts a CSGObject and puts it in the map of obj. + * @param obj the main object + * @param nested_obj the object to be casted and put in the obj map.
+ * @param parameter_name the name of nested_obj + */ +void cast_and_put( + const std::shared_ptr& obj, + const std::shared_ptr& nested_obj, + const std::string& parameter_name) +{ + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + // TODO: remove clone + // temporary fix until shared_ptr PR merged + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + return; + } + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + return; + } + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + return; + } + SG_SERROR("Could not cast SGObject.\n") +} + +std::shared_ptr ShogunOpenML::flow_to_model( + std::shared_ptr flow, bool initialize_with_defaults) +{ + auto params = flow->get_parameters(); + auto components = flow->get_components(); + auto class_name = get_class_info(flow->get_class_name()); + auto module_name = class_name.first; + auto algo_name = class_name.second; + + auto obj = instantiate_model_from_factory(module_name, algo_name); + auto obj_param = obj->get_params(); + + auto visitor = std::make_unique(obj); + + if (initialize_with_defaults) + { + for (const auto& param : params) + { + Any any_val = obj_param.at(param.first)->get_value(); + std::string name = param.first; + std::string val_as_string = param.second.at("default_value"); + visitor->set_parameter_name(name); + visitor->set_string_value(val_as_string); + any_val.visit(visitor.get()); + } + } + + for (const auto& component : components) + { + std::shared_ptr nested_obj = + flow_to_model(component.second, initialize_with_defaults); + cast_and_put(obj, nested_obj, component.first); + } + + SG_SDEBUG("Final object: %s.\n", obj->to_string().c_str()); + + return obj; +} + +std::shared_ptr +ShogunOpenML::model_to_flow(const std::shared_ptr& 
model) +{ + return std::shared_ptr(); +} + +std::pair +ShogunOpenML::get_class_info(const std::string& class_name) +{ + std::vector class_components; + auto begin = class_name.begin(); + std::pair result; + + for (auto it = class_name.begin(); it != class_name.end(); ++it) + { + if (*it == '.') + { + class_components.emplace_back(std::string(begin, it)); + begin = std::next(it); + } + if (std::next(it) == class_name.end()) + class_components.emplace_back(std::string(begin, std::next(it))); + } + + if (class_components[0] == "shogun" && class_components.size() == 3) + result = std::make_pair(class_components[1], class_components[2]); + else if (class_components[0] == "shogun" && class_components.size() != 3) + SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) + else + SG_SERROR( + "The provided flow is not meant for shogun deserialisation! The " + "required library is \"%s\".\n", + class_components[0].c_str()) + + return result; +} + +std::shared_ptr ShogunOpenML::run_model_on_fold( + const std::shared_ptr& model, + const std::shared_ptr& task, + const std::shared_ptr& X_train, index_t repeat_number, + index_t fold_number, const std::shared_ptr& y_train, + const std::shared_ptr& X_test) +{ + auto task_type = task->get_task_type(); + auto model_clone = std::shared_ptr(model->clone()); + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + { + if (auto machine = std::dynamic_pointer_cast(model_clone)) + { + // TODO: refactor. 
more useless clones until smart pointers are merged + machine->put("labels", y_train->clone()->as()); + auto tmp = X_train.get(); + machine->train(tmp); + if (X_test) + return std::shared_ptr(machine->apply(X_test.get())); + else + return std::shared_ptr(machine->apply(X_train.get())); + } + else + SG_SERROR("The provided model is not a trainable machine!\n") + } + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + return nullptr; +} \ No newline at end of file diff --git a/src/shogun/io/openml/ShogunOpenML.h b/src/shogun/io/openml/ShogunOpenML.h new file mode 100644 index 00000000000..ff5257f19b8 --- /dev/null +++ b/src/shogun/io/openml/ShogunOpenML.h @@ -0,0 +1,70 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_SHOGUNOPENML_H +#define SHOGUN_SHOGUNOPENML_H + +#include + +#include +#include + +namespace shogun +{ + class OpenMLRun; + /** + * The Shogun OpenML extension to run models from an OpenMLFlow + * and convert models to OpenMLFlow. + */ + class ShogunOpenML + { + public: + friend class OpenMLRun; + /** + * Instantiates a SGObject from an OpenMLFlow. + * + * @param flow the flow to instantiate + * @param initialize_with_defaults whether to use the default values + * specified in the flow + * @return the flow as a trainable model + */ + static std::shared_ptr flow_to_model( + std::shared_ptr flow, bool initialize_with_defaults); + + /** + * Converts a SGObject to an OpenMLFlow. 
+ * + * @param model the model to convert + * @return the flow from the model conversion + */ + static std::shared_ptr + model_to_flow(const std::shared_ptr& model); + + protected: + static std::shared_ptr run_model_on_fold( + const std::shared_ptr& model, + const std::shared_ptr& task, + const std::shared_ptr& X_train, index_t repeat_number, + index_t fold_number, const std::shared_ptr& y_train, + const std::shared_ptr& X_test); + + private: + /** + * Helper function to extract module/factory information from the + * class name field of OpenMLFlow. Throws an error either if the + * class name field is ill formed (i.e. not + * library.module.algorithm) or if the library name is not "shogun". + * + * @param class_name the flow class_name field + * @return a tuple with the module name (factory string) and the + * algorithm name + */ + static std::pair + get_class_info(const std::string& class_name); + }; +} // namespace shogun + +#endif // SHOGUN_SHOGUNOPENML_H diff --git a/src/shogun/io/openml/utils.h b/src/shogun/io/openml/utils.h new file mode 100644 index 00000000000..a51da80857e --- /dev/null +++ b/src/shogun/io/openml/utils.h @@ -0,0 +1,131 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). 
+ * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENML_UTILS_H +#define SHOGUN_OPENML_UTILS_H + +#include + +#include + +namespace shogun +{ + namespace openml_detail + { + /** + * Checks the returned response from OpenML in JSON format + * @param doc the parsed OpenML JSON format response + */ + static void + check_response(const rapidjson::Document& doc, const std::string& type) + { + if (SG_UNLIKELY(doc.HasMember("error"))) + { + const rapidjson::Value& root = doc["error"]; + SG_SERROR( + "Server error %s: %s\n", root["code"].GetString(), + root["message"].GetString()) + return; + } + REQUIRE( + doc.HasMember(type.c_str()), + "Unexpected format of OpenML %s.\n", type.c_str()); + } + + /** + * Helper function to add JSON objects as string in map + * @param v a RapidJSON GenericValue, i.e. string + * @param param_dict the map to write to + * @param name the name of the key + */ + static SG_FORCED_INLINE void emplace_string_to_map( + const rapidjson::GenericValue>& v, + std::unordered_map& param_dict, + const std::string& name) + { + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + param_dict.emplace(name, v[name.c_str()].GetString()); + else + param_dict.emplace(name, ""); + } + + /** + * Helper function to add JSON objects as string in map + * @param v a RapidJSON GenericObject, i.e. 
array + * @param param_dict the map to write to + * @param name the name of the key + */ + static SG_FORCED_INLINE void emplace_string_to_map( + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v, + std::unordered_map& param_dict, + const std::string& name) + { + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + param_dict.emplace(name, v[name.c_str()].GetString()); + else + param_dict.emplace(name, ""); + } + + template + SG_FORCED_INLINE T return_if_possible( + const std::string& name, + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v) + { + SG_SNOTIMPLEMENTED + } + + template <> + SG_FORCED_INLINE std::string return_if_possible( + const std::string& name, + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v) + { + if (v.HasMember(name.c_str()) && v[name.c_str()].IsString()) + return v[name.c_str()].GetString(); + if (v.HasMember(name.c_str()) && !v[name.c_str()].IsString()) + SG_SERROR( + "Found member \"%s\" but it is not a string", name.c_str()) + if (!v.HasMember(name.c_str())) + return ""; + SG_SERROR( + "\"%s\" is not a member of the given object", name.c_str()) + return nullptr; + } + + template <> + SG_FORCED_INLINE std::vector + return_if_possible>( + const std::string& name, + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v) + { + std::vector result; + if (!v.HasMember(name.c_str())) + SG_SERROR( + "\"%s\" is not a member of the given object", name.c_str()) + if (v[name.c_str()].IsString()) + { + result.emplace_back(v[name.c_str()].GetString()); + } + if (v[name.c_str()].IsArray()) + { + for (const auto& val : v[name.c_str()].GetArray()) + { + if (val.IsString()) + result.emplace_back(val.GetString()); + else + SG_SERROR( + "Found non string member in \"%s\".\n", + name.c_str()) + } + } + return result; + } + } // namespace openml_detail +} // namespace shogun +#endif // SHOGUN_OPENML_UTILS_H From 40fda20fd5239d2fe1b6f9d5502f2a5ab8568d81 Mon Sep 17 00:00:00 2001 
From: gf712 Date: Sat, 18 May 2019 18:18:11 +0100 Subject: [PATCH 12/32] train and test index getter --- src/shogun/io/openml/OpenMLReader.cpp | 2 +- src/shogun/io/openml/OpenMLSplit.cpp | 70 +++++++++++------------- src/shogun/io/openml/OpenMLSplit.h | 76 ++++++++++++++++++++++----- src/shogun/io/openml/OpenMLTask.cpp | 23 +++++--- src/shogun/io/openml/OpenMLTask.h | 8 +-- src/shogun/io/openml/utils.h | 6 ++- 6 files changed, 121 insertions(+), 64 deletions(-) diff --git a/src/shogun/io/openml/OpenMLReader.cpp b/src/shogun/io/openml/OpenMLReader.cpp index bd9d1b7de34..3751e686d19 100644 --- a/src/shogun/io/openml/OpenMLReader.cpp +++ b/src/shogun/io/openml/OpenMLReader.cpp @@ -51,7 +51,7 @@ const char* OpenMLReader::flow_file = "/flow/{}"; /* TASK API */ const char* OpenMLReader::task_file = "/task/{}"; /* SPLIT API */ -const char* OpenMLReader::get_split = "/get/{}"; +const char* OpenMLReader::get_split = ""; const std::unordered_map OpenMLReader::m_format_options = {{"xml", xml_server}, diff --git a/src/shogun/io/openml/OpenMLSplit.cpp b/src/shogun/io/openml/OpenMLSplit.cpp index c4481a860cd..19569137e0d 100644 --- a/src/shogun/io/openml/OpenMLSplit.cpp +++ b/src/shogun/io/openml/OpenMLSplit.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -17,16 +18,16 @@ std::shared_ptr OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) { auto reader = OpenMLReader(api_key); - auto return_string = reader.get("get_split", "split", split_url); + auto return_string = reader.get(split_url); if (return_string == "Task not providing datasplits.") return std::make_shared(); - auto return_stream = std::istringstream(return_string); - // TODO: add ARFF parsing here - // get train/test indices - // TODO: replace line below with ARFFDeserialiser::get_features() - auto arff_features = std::make_shared(); + std::shared_ptr return_stream = + std::make_shared(return_string); + auto arff_parser = ARFFDeserializer(return_stream); + 
arff_parser.read(); + auto arff_features = arff_parser.get_features(); REQUIRE( arff_features->get_num_feature_obj() == 4, "Expected a ARFF file with 4 attributes: type, rowid, repeat and " @@ -41,30 +42,33 @@ OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) auto fold_feat = std::shared_ptr(arff_features->get_feature_obj(3)); - auto type_vector = string_feature_to_vector(train_test_feat); + auto type_vector = nominal_feature_to_vector(train_test_feat); auto rowid_vector = dense_feature_to_vector(rowid_feat); auto repeat_vector = dense_feature_to_vector(repeat_feat); auto fold_vector = dense_feature_to_vector(fold_feat); - std::vector> train_idx, test_idx; - for (int i = 0; i < arff_features->get_num_vectors(); ++i) + std::array, 3> train_idx, test_idx; + + for (int i = 0; i < train_test_feat->get_num_vectors(); ++i) { if (type_vector[i] == LabelType::TRAIN) - train_idx.emplace_back(std::initializer_list{ - static_cast(rowid_vector[i]), - static_cast(repeat_vector[i]), - static_cast(fold_vector[i])}); + { + train_idx[0].push_back(rowid_vector[i]); + train_idx[1].push_back(repeat_vector[i]); + train_idx[2].push_back(fold_vector[i]); + } else - test_idx.emplace_back(std::initializer_list{ - static_cast(rowid_vector[i]), - static_cast(repeat_vector[i]), - static_cast(fold_vector[i])}); + { + test_idx[0].push_back(rowid_vector[i]); + test_idx[1].push_back(repeat_vector[i]); + test_idx[2].push_back(fold_vector[i]); + } } return std::make_shared(train_idx, test_idx); } -SGVector +SGMatrix OpenMLSplit::dense_feature_to_vector(const std::shared_ptr& feat) { auto casted_feat = @@ -73,40 +77,28 @@ OpenMLSplit::dense_feature_to_vector(const std::shared_ptr& feat) if (!casted_feat) SG_SERROR("Error casting a column in the split file from CFeatures to " "CDenseFeatures!\n>"); - return casted_feat->get_feature_vector(0); + return casted_feat->get_feature_matrix(); } std::vector -OpenMLSplit::string_feature_to_vector(const std::shared_ptr& feat) 
+OpenMLSplit::nominal_feature_to_vector(const std::shared_ptr& feat) { - auto casted_feat = std::dynamic_pointer_cast>(feat); + auto casted_feat = + std::dynamic_pointer_cast>(feat); // this should never happen if (!casted_feat) SG_SERROR("Error casting a column in the split file from CFeatures to " - "CStringFeatures!\n"); - - auto to_lower = [](const std::string& line) { - std::string result; - std::transform( - line.begin(), line.end(), std::back_inserter(result), - [](uint8_t val) { return std::tolower(val); }); - return result; - }; + "CDenseFeatures!\n"); std::vector result; - for (int i = 0; i < casted_feat->get_num_vectors(); ++i) + for (const auto& el : casted_feat->get_feature_matrix()) { - auto row = casted_feat->get_feature_vector(i); - std::string label(1, row[0]); - for (auto j = 1; j < casted_feat->get_max_vector_length(); ++j) - label.append(1, row[j]); - if (to_lower(label) == "train") + if (el == 0) result.push_back(LabelType::TRAIN); - else if (to_lower(label) == "test") - result.push_back(LabelType::TEST); else - SG_SERROR("Unknown label type in split file %s!\n", label.c_str()) + result.push_back(LabelType::TEST); } + return result; } \ No newline at end of file diff --git a/src/shogun/io/openml/OpenMLSplit.h b/src/shogun/io/openml/OpenMLSplit.h index ab61aebdd77..f032ee91647 100644 --- a/src/shogun/io/openml/OpenMLSplit.h +++ b/src/shogun/io/openml/OpenMLSplit.h @@ -10,6 +10,8 @@ #include #include +#include + namespace shogun { /** @@ -28,44 +30,92 @@ namespace shogun * Default constructor. This is used when there are no * train or test indices. 
*/ - OpenMLSplit() = default; + OpenMLSplit() : m_repeat_count(0), m_fold_count(0) + { + } OpenMLSplit( - std::vector> train_idx, - std::vector> test_idx) - : m_train_idx(std::move(train_idx)), m_test_idx(std::move(test_idx)) + const std::array, 3>& train_idx, + const std::array, 3>& test_idx) + : m_train_idx(train_idx), m_test_idx(test_idx) { + // repeats and folds are zero indexed so add 1 + // we also assume that the repeats and folds indices go from + // 0,1,...,N in increments of 1 + m_data_count = + std::make_pair(train_idx[0].size(), test_idx[0].size()); + m_repeat_count = + *std::max_element(train_idx[1].begin(), train_idx[1].end()) + 1; + m_fold_count = + *std::max_element(train_idx[2].begin(), train_idx[2].end()) + 1; + auto test_repeat_count = + *std::max_element(test_idx[1].begin(), test_idx[1].end()) + 1; + auto test_fold_count = + *std::max_element(test_idx[2].begin(), test_idx[2].end()) + 1; + + REQUIRE( + train_idx[0].size() == train_idx[1].size() && + train_idx[0].size() == train_idx[2].size(), + "All dimensions in train_idx must match!\n") + REQUIRE( + test_idx[0].size() == test_idx[1].size() && + test_idx[0].size() == test_idx[2].size(), + "All dimensions in test_idx must match!\n") + + if (m_repeat_count != test_repeat_count) + SG_SERROR( + "Expected the train and test set to have the same number " + "of repeats, but got %d and %d respectively.\n", + m_repeat_count, test_repeat_count) + if (m_repeat_count != test_repeat_count) + SG_SERROR( + "Expected the train and test set to have the same number " + "of folds, but got %d and %d respectively.\n", + m_fold_count, test_fold_count) } static std::shared_ptr get_split(const std::string& split_url, const std::string& api_key); - SG_FORCED_INLINE std::vector> get_train_idx() const - noexcept + SG_FORCED_INLINE std::array, 3> + get_train_idx() const noexcept { return m_train_idx; } - SG_FORCED_INLINE std::vector> get_test_idx() const - noexcept + SG_FORCED_INLINE std::array, 3> + get_test_idx() const 
noexcept { return m_test_idx; } SG_FORCED_INLINE bool contains_splits() const noexcept { - return !m_train_idx.empty() && !m_test_idx.empty(); + return !m_train_idx[0].empty() && !m_test_idx[0].empty(); + } + + SG_FORCED_INLINE int32_t get_num_repeats() const noexcept + { + return m_repeat_count; + } + + SG_FORCED_INLINE int32_t get_num_folds() const noexcept + { + return m_fold_count; } private: - static SGVector + static SGMatrix dense_feature_to_vector(const std::shared_ptr& feat); static std::vector - string_feature_to_vector(const std::shared_ptr& feat); + nominal_feature_to_vector(const std::shared_ptr& feat); - std::vector> m_train_idx; - std::vector> m_test_idx; + std::array, 3> m_train_idx; + std::array, 3> m_test_idx; + std::pair m_data_count; + int32_t m_repeat_count; + int32_t m_fold_count; }; } // namespace shogun #endif // SHOGUN_OPENMLSPLIT_H diff --git a/src/shogun/io/openml/OpenMLTask.cpp b/src/shogun/io/openml/OpenMLTask.cpp index 493664a8367..f1382d3225e 100644 --- a/src/shogun/io/openml/OpenMLTask.cpp +++ b/src/shogun/io/openml/OpenMLTask.cpp @@ -118,20 +118,31 @@ OpenMLTask::get_task_from_string(const std::string& task_type) SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) } -std::vector> OpenMLTask::get_train_indices() const +std::vector>> +OpenMLTask::get_train_indices() const { return get_indices(m_split->get_train_idx()); } -std::vector> OpenMLTask::get_test_indices() const +std::vector>> +OpenMLTask::get_test_indices() const { return get_indices(m_split->get_test_idx()); } -std::vector> -OpenMLTask::get_indices(const std::vector>& idx) const +std::vector>> +OpenMLTask::get_indices(const std::array, 3>& idx) const { - SG_SNOTIMPLEMENTED - std::vector> result; + // result = (n_repeats, n_folds, ?) 
where is the number of indices in a + // given fold + std::vector>> result( + m_split->get_num_repeats(), + std::vector>( + m_split->get_num_folds(), std::vector{})); + for (int i = 0; i < idx[0].size(); ++i) + { + // result[repeat][fold].push_back(data_index) + result[idx[1][i]][idx[2][i]].push_back(idx[0][i]); + } return result; } \ No newline at end of file diff --git a/src/shogun/io/openml/OpenMLTask.h b/src/shogun/io/openml/OpenMLTask.h index a849ea37de9..7115b87bf71 100644 --- a/src/shogun/io/openml/OpenMLTask.h +++ b/src/shogun/io/openml/OpenMLTask.h @@ -62,9 +62,9 @@ namespace shogun return m_split; } - std::vector> get_train_indices() const; + std::vector>> get_train_indices() const; - std::vector> get_test_indices() const; + std::vector>> get_test_indices() const; #ifndef SWIG SG_FORCED_INLINE TaskType @@ -79,8 +79,8 @@ namespace shogun private: static TaskType get_task_from_string(const std::string& task_type); - std::vector> - get_indices(const std::vector>& idx) const; + std::vector>> + get_indices(const std::array, 3>& idx) const; std::string m_task_id; std::string m_task_name; diff --git a/src/shogun/io/openml/utils.h b/src/shogun/io/openml/utils.h index a51da80857e..122ce170098 100644 --- a/src/shogun/io/openml/utils.h +++ b/src/shogun/io/openml/utils.h @@ -44,10 +44,14 @@ namespace shogun static SG_FORCED_INLINE void emplace_string_to_map( const rapidjson::GenericValue>& v, std::unordered_map& param_dict, - const std::string& name) + const std::string& name, bool required = false) { if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) param_dict.emplace(name, v[name.c_str()].GetString()); + else if (required) + SG_SERROR( + "The field \"%s\" is expected to be a string!\n", + name.c_str()) else param_dict.emplace(name, ""); } From 677c1e74109bb7573640a642bc4e5674731078a5 Mon Sep 17 00:00:00 2001 From: gf712 Date: Mon, 20 May 2019 08:45:05 +0100 Subject: [PATCH 13/32] added crossvalidation --- src/shogun/io/openml/OpenMLData.cpp | 174 
++++++++++++++------------ src/shogun/io/openml/OpenMLData.h | 3 + src/shogun/io/openml/OpenMLRun.cpp | 143 ++++++++++++++++++--- src/shogun/io/openml/OpenMLTask.cpp | 4 +- src/shogun/io/openml/OpenMLTask.h | 14 ++- src/shogun/io/openml/ShogunOpenML.cpp | 2 +- 6 files changed, 242 insertions(+), 98 deletions(-) diff --git a/src/shogun/io/openml/OpenMLData.cpp b/src/shogun/io/openml/OpenMLData.cpp index 66dc3dcf482..a85a166ad78 100644 --- a/src/shogun/io/openml/OpenMLData.cpp +++ b/src/shogun/io/openml/OpenMLData.cpp @@ -32,55 +32,55 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) const Value& dataset_description = document["data_set_description"]; auto name = return_if_possible( - "name", dataset_description.GetObject()); + "name", dataset_description.GetObject()); auto description = return_if_possible( - "description", dataset_description.GetObject()); + "description", dataset_description.GetObject()); auto data_format = return_if_possible( - "data_format", dataset_description.GetObject()); + "data_format", dataset_description.GetObject()); auto dataset_id = - return_if_possible("id", dataset_description.GetObject()); + return_if_possible("id", dataset_description.GetObject()); auto version = return_if_possible( - "version", dataset_description.GetObject()); + "version", dataset_description.GetObject()); auto creator = return_if_possible( - "creator", dataset_description.GetObject()); + "creator", dataset_description.GetObject()); auto contributor = return_if_possible( - "contributor", dataset_description.GetObject()); + "contributor", dataset_description.GetObject()); auto collection_date = return_if_possible( - "collection_date", dataset_description.GetObject()); + "collection_date", dataset_description.GetObject()); auto upload_date = return_if_possible( - "upload_date", dataset_description.GetObject()); + "upload_date", dataset_description.GetObject()); auto language = return_if_possible( - "language", 
dataset_description.GetObject()); + "language", dataset_description.GetObject()); auto licence = return_if_possible( - "licence", dataset_description.GetObject()); + "licence", dataset_description.GetObject()); auto url = - return_if_possible("url", dataset_description.GetObject()); + return_if_possible("url", dataset_description.GetObject()); auto default_target_attribute = return_if_possible( - "default_target_attribute", dataset_description.GetObject()); + "default_target_attribute", dataset_description.GetObject()); auto row_id_attribute = return_if_possible( - "row_id_attribute", dataset_description.GetObject()); + "row_id_attribute", dataset_description.GetObject()); auto ignore_attribute = return_if_possible( - "ignore_attribute", dataset_description.GetObject()); + "ignore_attribute", dataset_description.GetObject()); auto version_label = return_if_possible( - "version_label", dataset_description.GetObject()); + "version_label", dataset_description.GetObject()); auto citation = return_if_possible( - "citation", dataset_description.GetObject()); + "citation", dataset_description.GetObject()); auto tags = return_if_possible>( - "tag", dataset_description.GetObject()); + "tag", dataset_description.GetObject()); auto visibility = return_if_possible( - "visibility", dataset_description.GetObject()); + "visibility", dataset_description.GetObject()); auto original_data_url = return_if_possible( - "original_data_url", dataset_description.GetObject()); + "original_data_url", dataset_description.GetObject()); auto paper_url = return_if_possible( - "paper_url", dataset_description.GetObject()); + "paper_url", dataset_description.GetObject()); auto update_comment = return_if_possible( - "update_comment", dataset_description.GetObject()); + "update_comment", dataset_description.GetObject()); auto md5_checksum = return_if_possible( - "md5_checksum", dataset_description.GetObject()); + "md5_checksum", dataset_description.GetObject()); // features std::vector>> - 
param_vector; + param_vector; return_string = reader.get("data_features", "json", id); document.Parse(return_string.c_str()); check_response(document, "data_features"); @@ -115,8 +115,8 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) { if (param_quality.name.IsString() && param_quality.value.IsString()) param_map.emplace( - param_quality.name.GetString(), - param_quality.value.GetString()); + param_quality.name.GetString(), + param_quality.value.GetString()); else if (param_quality.name.IsString()) param_map.emplace(param_quality.name.GetString(), ""); } @@ -124,11 +124,11 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) } auto result = std::make_shared( - name, description, data_format, dataset_id, version, creator, - contributor, collection_date, upload_date, language, licence, url, - default_target_attribute, row_id_attribute, ignore_attribute, - version_label, citation, tags, visibility, original_data_url, paper_url, - update_comment, md5_checksum, param_vector, qualities_vector); + name, description, data_format, dataset_id, version, creator, + contributor, collection_date, upload_date, language, licence, url, + default_target_attribute, row_id_attribute, ignore_attribute, + version_label, citation, tags, visibility, original_data_url, paper_url, + update_comment, md5_checksum, param_vector, qualities_vector); result->set_api_key(api_key); return result; } @@ -145,11 +145,9 @@ std::shared_ptr OpenMLData::get_features(const std::string& label) if (!m_cached_features) get_data(); auto find_label = - std::find(m_feature_names.begin(), m_feature_names.end(), label); + std::find(m_feature_names.begin(), m_feature_names.end(), label); if (find_label == m_feature_names.end()) SG_SERROR("Requested label \"%s\" not in the dataset!\n", label.c_str()) - if (!m_cached_features) - get_data(); auto col_idx = std::distance(m_feature_names.begin(), find_label); auto feat_type_copy = m_feature_types; 
feat_type_copy.erase(feat_type_copy.begin() + col_idx); @@ -158,84 +156,102 @@ std::shared_ptr OpenMLData::get_features(const std::string& label) if (type == ARFFDeserializer::Attribute::STRING) SG_SERROR("Currently cannot process string features!\n") } - std::shared_ptr result; + // auto result = std::make_shared(); + std::shared_ptr> result; bool first = true; for (int i = 0; i < m_feature_types.size(); ++i) { if (i != col_idx && first) { - result.reset(m_cached_features->get_feature_obj(i)); + result.reset(m_cached_features->get_feature_obj(i) + ->as>()); first = false; } if (i != col_idx) - result.reset(result->create_merged_copy( - m_cached_features->get_feature_obj(i))); + result.reset( + result + ->create_merged_copy(m_cached_features->get_feature_obj(i)) + ->as>()); } - std::dynamic_pointer_cast>(result)->set_num_features(m_feature_types.size()); - std::dynamic_pointer_cast>(result)->set_num_vectors(m_cached_features->get_num_vectors()); + + // need to copy data as result is only in the stack and the data + // will be gone at the end of the function + auto* copy_feat = SG_MALLOC( + float64_t, + m_feature_types.size() * m_cached_features->get_num_vectors()); + memcpy( + copy_feat, result->get_feature_matrix().data(), + m_feature_types.size() * m_cached_features->get_num_vectors()); + + result = std::make_shared>( + copy_feat, m_feature_types.size(), + m_cached_features->get_num_vectors()); return result; } std::shared_ptr OpenMLData::get_labels() { - if (!m_cached_features) - get_data(); REQUIRE( - !m_default_target_attribute.empty(), - "A default target attribute is required if no label is given!\n") + !m_default_target_attribute.empty(), + "A default target attribute is required if no label is given!\n") return get_labels(m_default_target_attribute); } std::shared_ptr OpenMLData::get_labels(const std::string& label_name) { + if (m_cached_labels && label_name == m_cached_label_name) + return m_cached_labels; + if (!m_cached_features) get_data(); + auto 
find_label = - std::find(m_feature_names.begin(), m_feature_names.end(), label_name); + std::find(m_feature_names.begin(), m_feature_names.end(), label_name); if (find_label == m_feature_names.end()) SG_SERROR( - "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) + "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) auto col_idx = std::distance(m_feature_names.begin(), find_label); - if (!m_cached_features) - get_data(); - auto target_label_as_feat = - std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); + std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); switch (m_feature_types[col_idx]) { - // real features - case ARFFDeserializer::Attribute::REAL: - case ARFFDeserializer::Attribute::NUMERIC: - case ARFFDeserializer::Attribute::INTEGER: - case ARFFDeserializer::Attribute::DATE: - { - auto casted_feat = std::dynamic_pointer_cast>( - target_label_as_feat); - auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); - auto labels = std::make_shared(labels_vec); - return labels; - } - break; - // nominal features - case ARFFDeserializer::Attribute::NOMINAL: + // real features + case ARFFDeserializer::Attribute::REAL: + case ARFFDeserializer::Attribute::NUMERIC: + case ARFFDeserializer::Attribute::INTEGER: + case ARFFDeserializer::Attribute::DATE: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + auto labels = std::make_shared(labels_vec); + m_cached_labels = labels; + m_cached_label_name = label_name; + return m_cached_labels; + } + break; + // nominal features + case ARFFDeserializer::Attribute::NOMINAL: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + for (auto& val : labels_vec) { - auto casted_feat = std::dynamic_pointer_cast>( - target_label_as_feat); - auto labels_vec = 
casted_feat->get_feature_matrix().get_row_vector(0); - for(auto& val: labels_vec) - { - if (val == 0) - val = -1; - } - auto labels = std::make_shared(labels_vec); - return labels; + if (val == 0) + val = -1; } - break; - default: - SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) + auto labels = std::make_shared(labels_vec); + m_cached_labels = labels; + m_cached_label_name = label_name; + return m_cached_labels; + } + break; + default: + SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) } return nullptr; @@ -245,7 +261,7 @@ void OpenMLData::get_data() { auto reader = OpenMLReader(m_api_key); std::shared_ptr ss = - std::make_shared(reader.get(m_url)); + std::make_shared(reader.get(m_url)); auto parser = ARFFDeserializer(ss); parser.read(); diff --git a/src/shogun/io/openml/OpenMLData.h b/src/shogun/io/openml/OpenMLData.h index 7e21474775b..25bf54f3084 100644 --- a/src/shogun/io/openml/OpenMLData.h +++ b/src/shogun/io/openml/OpenMLData.h @@ -90,6 +90,7 @@ namespace shogun */ std::shared_ptr get_labels(); + /** * Returns the dataset labels given the label_name * @return the labels @@ -155,6 +156,8 @@ namespace shogun std::shared_ptr m_cached_features; std::vector m_feature_names; std::vector m_feature_types; + std::shared_ptr m_cached_labels; + std::string m_cached_label_name; }; } // namespace shogun diff --git a/src/shogun/io/openml/OpenMLRun.cpp b/src/shogun/io/openml/OpenMLRun.cpp index 796545e10de..6c6420f54b3 100644 --- a/src/shogun/io/openml/OpenMLRun.cpp +++ b/src/shogun/io/openml/OpenMLRun.cpp @@ -4,6 +4,7 @@ * Authors: Gil Hoben */ +#include #include #include #include @@ -12,36 +13,150 @@ using namespace shogun; std::shared_ptr OpenMLRun::run_model_on_task( - std::shared_ptr model, std::shared_ptr task) + std::shared_ptr model, std::shared_ptr task) { SG_SNOTIMPLEMENTED return std::shared_ptr(); } std::shared_ptr OpenMLRun::run_flow_on_task( - std::shared_ptr flow, std::shared_ptr task) + std::shared_ptr flow, 
std::shared_ptr task) { auto data = task->get_dataset(); - std::shared_ptr train_features = nullptr, test_features = nullptr; - std::shared_ptr train_labels = nullptr, test_labels = nullptr; + std::shared_ptr features = nullptr; + std::shared_ptr labels = nullptr; + + auto model = ShogunOpenML::flow_to_model(std::move(flow), true); + + labels = data->get_labels(); + features = data->get_features(data->get_default_target_attribute()); + + auto storage = std::make_shared(); if (task->get_split()->contains_splits()) - SG_SNOTIMPLEMENTED + { + auto machine = std::dynamic_pointer_cast(model); + if (!machine) + { + SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") + } + auto train_idx = task->get_train_indices(); + auto test_idx = task->get_test_indices(); + + auto xval_storage = std::make_shared(); + xval_storage->set_num_folds(task->get_num_fold()); + xval_storage->set_num_runs(task->get_num_repeats()); + machine->set_store_model_features(true); + + // copied/adapted from crossvalidation + for (auto repeat_idx : range(task->get_num_repeats())) + { + for (auto fold_idx : range(task->get_num_fold())) + { + auto* fold = new CrossValidationFoldStorage(); + SG_REF(fold) + + auto cloned_machine = (CMachine*)machine->clone(); + + // TODO while these are not used through const interfaces, + // we unfortunately have to clone, even though these could be + // shared + auto features_clone = (CFeatures*)features->clone(); + auto labels_clone = (CLabels*)labels->clone(); + // auto evaluation_criterion = + // (CEvaluation*)m_evaluation_criterion->clone(); + + /* evtl. 
update xvalidation output class */ + fold->set_run_index(repeat_idx); + fold->set_fold_index(fold_idx); + + auto train_fold_idx = SGVector( + train_idx[repeat_idx][fold_idx].data(), + train_idx[repeat_idx][fold_idx].size(), false); + + features_clone->add_subset(train_fold_idx); + + /* set label subset for training */ + labels_clone->add_subset(train_fold_idx); + + SG_SDEBUG( + "train set repeat %d fold %d: %s\n", repeat_idx, fold_idx, + train_fold_idx.to_string().c_str()) + + /* train machine on training features and remove subset */ + SG_SDEBUG("starting training\n") + cloned_machine->set_labels(labels_clone); + cloned_machine->train(features_clone); + SG_SDEBUG("finished training\n") + + /* evtl. update xvalidation output class */ + fold->set_train_indices(train_fold_idx); + auto fold_machine = (CMachine*)cloned_machine->clone(); + fold->set_trained_machine(fold_machine); + SG_UNREF(fold_machine) + + features_clone->remove_subset(); + labels_clone->remove_subset(); + + /* set feature subset for testing (subset method that stores + * pointer) */ + auto test_fold_idx = SGVector( + test_idx[repeat_idx][fold_idx].data(), + test_idx[repeat_idx][fold_idx].size(), false); + features_clone->add_subset(test_fold_idx); + + /* set label subset for testing */ + labels_clone->add_subset(test_fold_idx); + + SG_SDEBUG( + "test set repeat %d fold %d: %s\n", repeat_idx, fold_idx, + test_fold_idx.to_string().c_str()) + + /* apply machine to test features and remove subset */ + SG_SDEBUG("starting evaluation\n") + SG_SDEBUG("%p\n", features_clone) + CLabels* result_labels = cloned_machine->apply(features_clone); + SG_SDEBUG("finished evaluation\n") + features_clone->remove_subset(); + SG_REF(result_labels); + + /* evaluate */ + // results[i] = + // evaluation_criterion->evaluate(result_labels, + //labels); SG_DEBUG("result on fold %d is %f\n", i, results[i]) + + /* evtl. 
update xvalidation output class */ + // fold->set_test_indices(test_fold_idx); + // fold->set_test_result(result_labels); + // auto* true_labels = (CLabels*)labels->clone(); + // fold->set_test_true_result(true_labels); + // SG_UNREF(true_labels) + // fold->post_update_results(); + // fold->set_evaluation_result(results[i]); + + storage->append_fold_result(fold); + // + // /* clean up, remove subsets */ + // labels->remove_subset(); + SG_UNREF(cloned_machine); + SG_UNREF(features_clone); + SG_UNREF(labels_clone); + // SG_UNREF(evaluation_criterion); + // SG_UNREF(result_labels); + SG_UNREF(fold) + } + } + } else { - train_labels = data->get_labels(); - train_features = - data->get_features(data->get_default_target_attribute()); // ensures delete is called by shared ptr destructor - SG_REF(train_labels.get()) - SG_REF(train_features.get()) - auto model = ShogunOpenML::flow_to_model(std::move(flow), true); - + SG_REF(labels.get()) + SG_REF(features.get()) if (auto machine = std::dynamic_pointer_cast(model)) { auto result = ShogunOpenML::run_model_on_fold( - machine, task, train_features, 0, 0, train_labels, - test_features); + machine, task, features, 0, 0, labels, + nullptr); SG_SDEBUG(result->to_string().c_str()); } else diff --git a/src/shogun/io/openml/OpenMLTask.cpp b/src/shogun/io/openml/OpenMLTask.cpp index f1382d3225e..c4a4c1fb00c 100644 --- a/src/shogun/io/openml/OpenMLTask.cpp +++ b/src/shogun/io/openml/OpenMLTask.cpp @@ -118,13 +118,13 @@ OpenMLTask::get_task_from_string(const std::string& task_type) SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) } -std::vector>> +std::vector>> OpenMLTask::get_train_indices() const { return get_indices(m_split->get_train_idx()); } -std::vector>> +std::vector>> OpenMLTask::get_test_indices() const { return get_indices(m_split->get_test_idx()); diff --git a/src/shogun/io/openml/OpenMLTask.h b/src/shogun/io/openml/OpenMLTask.h index 7115b87bf71..9f5e97c1bef 100644 --- a/src/shogun/io/openml/OpenMLTask.h 
+++ b/src/shogun/io/openml/OpenMLTask.h @@ -62,9 +62,19 @@ namespace shogun return m_split; } - std::vector>> get_train_indices() const; + std::vector>> get_train_indices() const; - std::vector>> get_test_indices() const; + std::vector>> get_test_indices() const; + + int32_t get_num_fold() const noexcept + { + return m_split->get_num_folds(); + } + + int32_t get_num_repeats() const noexcept + { + return m_split->get_num_repeats(); + } #ifndef SWIG SG_FORCED_INLINE TaskType diff --git a/src/shogun/io/openml/ShogunOpenML.cpp b/src/shogun/io/openml/ShogunOpenML.cpp index 80dbc03810f..aff02d90a62 100644 --- a/src/shogun/io/openml/ShogunOpenML.cpp +++ b/src/shogun/io/openml/ShogunOpenML.cpp @@ -310,7 +310,7 @@ std::shared_ptr ShogunOpenML::run_model_on_fold( { if (auto machine = std::dynamic_pointer_cast(model_clone)) { - // TODO: refactor. more useless clones until smart pointers are merged + // TODO: refactor! more useless clones until smart pointers are merged machine->put("labels", y_train->clone()->as()); auto tmp = X_train.get(); machine->train(tmp); From d06fe2f8f204066691f698d8752c501a2e430824 Mon Sep 17 00:00:00 2001 From: gf712 Date: Tue, 21 May 2019 15:52:55 +0100 Subject: [PATCH 14/32] some cleanups --- cmake/external/bitsery.cmake | 14 + src/shogun/CMakeLists.txt | 3 + src/shogun/io/openml/OpenMLData.cpp | 28 +- src/shogun/io/openml/OpenMLFile.cpp | 117 ++++++++ .../openml/{OpenMLReader.h => OpenMLFile.h} | 15 +- src/shogun/io/openml/OpenMLFlow.cpp | 202 ++++++++++--- src/shogun/io/openml/OpenMLFlow.h | 97 +++++- src/shogun/io/openml/OpenMLReader.cpp | 97 ------ src/shogun/io/openml/OpenMLRun.cpp | 178 ++++------- src/shogun/io/openml/OpenMLRun.h | 4 +- src/shogun/io/openml/OpenMLSplit.cpp | 4 +- src/shogun/io/openml/OpenMLTask.cpp | 67 +++-- src/shogun/io/openml/OpenMLTask.h | 13 +- src/shogun/io/openml/ShogunOpenML.cpp | 280 +++++++++++++----- src/shogun/io/openml/ShogunOpenML.h | 22 +- src/shogun/io/openml/utils.h | 100 ++++--- 16 files changed, 811 
insertions(+), 430 deletions(-) create mode 100644 cmake/external/bitsery.cmake create mode 100644 src/shogun/io/openml/OpenMLFile.cpp rename src/shogun/io/openml/{OpenMLReader.h => OpenMLFile.h} (91%) delete mode 100644 src/shogun/io/openml/OpenMLReader.cpp diff --git a/cmake/external/bitsery.cmake b/cmake/external/bitsery.cmake new file mode 100644 index 00000000000..1ae39553e46 --- /dev/null +++ b/cmake/external/bitsery.cmake @@ -0,0 +1,14 @@ +set(BITSERY_PREFIX ${CMAKE_BINARY_DIR}/bitsery) +set(BITSERY_INCLUDE_DIR "${BITSERY_PREFIX}/src/bitsery/include") + +include(ExternalProject) +ExternalProject_Add( + bitsery + PREFIX ${BITSERY_PREFIX} + DOWNLOAD_DIR ${THIRD_PARTY_DIR}/bitsery + URL https://github.com/fraillt/bitsery/archive/v4.6.0.tar.gz + URL_MD5 accb462f98a59ed2bc4ffa7de374c24b + INSTALL_COMMAND "" +) + +LIST(APPEND SHOGUN_DEPENDS bitsery) \ No newline at end of file diff --git a/src/shogun/CMakeLists.txt b/src/shogun/CMakeLists.txt index 506d4863cfd..7e05536e337 100644 --- a/src/shogun/CMakeLists.txt +++ b/src/shogun/CMakeLists.txt @@ -414,6 +414,9 @@ SHOGUN_DEPENDENCIES( include(external/RapidJSON) SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE ${RAPIDJSON_INCLUDE_DIR}) +include(external/bitsery) +SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE ${BITSERY_INCLUDE_DIR}) + if (NOT WIN32) # FIXME: HDF5 linking on WIN32 is broken. 
# at least with the hdf5 supplied in anaconda diff --git a/src/shogun/io/openml/OpenMLData.cpp b/src/shogun/io/openml/OpenMLData.cpp index a85a166ad78..728cb6b1484 100644 --- a/src/shogun/io/openml/OpenMLData.cpp +++ b/src/shogun/io/openml/OpenMLData.cpp @@ -9,11 +9,9 @@ #include #include -#include +#include #include -#include - using namespace shogun; using namespace shogun::openml_detail; using namespace rapidjson; @@ -22,14 +20,10 @@ std::shared_ptr OpenMLData::get_dataset(const std::string& id, const std::string& api_key) { // description - Document document; - auto reader = OpenMLReader(api_key); + auto reader = OpenMLFile(api_key); auto return_string = reader.get("dataset_description", "json", id); - document.Parse(return_string.c_str()); - check_response(document, "data_set_description"); - - const Value& dataset_description = document["data_set_description"]; + auto& dataset_description = check_response(return_string, "data_set_description"); auto name = return_if_possible( "name", dataset_description.GetObject()); @@ -82,9 +76,9 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) std::vector>> param_vector; return_string = reader.get("data_features", "json", id); - document.Parse(return_string.c_str()); - check_response(document, "data_features"); - const Value& dataset_features = document["data_features"]; + + auto& dataset_features = check_response(return_string, "data_features"); + for (const auto& param : dataset_features["feature"].GetArray()) { std::unordered_map> param_map; @@ -105,9 +99,9 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) // qualities std::vector> qualities_vector; return_string = reader.get("data_qualities", "json", id); - document.Parse(return_string.c_str()); - check_response(document, "data_qualities"); - const Value& data_qualities = document["data_qualities"]; + + auto& data_qualities = check_response(return_string, "data_qualities"); + for (const auto& param : 
data_qualities["quality"].GetArray()) { std::unordered_map param_map; @@ -154,7 +148,7 @@ std::shared_ptr OpenMLData::get_features(const std::string& label) for (const auto type : feat_type_copy) { if (type == ARFFDeserializer::Attribute::STRING) - SG_SERROR("Currently cannot process string features!\n") + SG_SNOTIMPLEMENTED } // auto result = std::make_shared(); std::shared_ptr> result; @@ -259,7 +253,7 @@ std::shared_ptr OpenMLData::get_labels(const std::string& label_name) void OpenMLData::get_data() { - auto reader = OpenMLReader(m_api_key); + auto reader = OpenMLFile(m_api_key); std::shared_ptr ss = std::make_shared(reader.get(m_url)); diff --git a/src/shogun/io/openml/OpenMLFile.cpp b/src/shogun/io/openml/OpenMLFile.cpp new file mode 100644 index 00000000000..3e20e4b264f --- /dev/null +++ b/src/shogun/io/openml/OpenMLFile.cpp @@ -0,0 +1,117 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include + +#ifdef HAVE_CURL +#include "OpenMLFile.h" +#include + +#endif // HAVE_CURL + +using namespace shogun; + +/** + * The writer callback function used to write the packets to a C++ string. 
+ * @param data the data received in CURL request + * @param size always 1 + * @param nmemb the size of data + * @param buffer_in the buffer to write to + * @return the size of buffer that was written + */ +size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) +{ + // check that the buffer string points to something + if (buffer_in != nullptr) + { + // Append the data to the buffer + buffer_in->append(data, size * nmemb); + + return size * nmemb; + } + return 0; +} + +/* OpenML server format */ +const char* OpenMLFile::xml_server = "https://www.openml.org/api/v1/xml"; +const char* OpenMLFile::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLFile::download_server = ""; +const char* OpenMLFile::splits_server = "https://www.openml.org/api_splits"; + +/* DATA API */ +const char* OpenMLFile::dataset_description = "/data/{}"; +const char* OpenMLFile::list_data_qualities = "/data/qualities/list"; +const char* OpenMLFile::data_features = "/data/features/{}"; +const char* OpenMLFile::data_qualities = "/data/qualities/{}"; +const char* OpenMLFile::list_dataset_qualities = "/data/qualities/{}"; +const char* OpenMLFile::list_dataset_filter = "/data/list/{}"; +/* FLOW API */ +const char* OpenMLFile::flow_file = "/flow/{}"; +const char* OpenMLFile::flow_exists = "/flow/exists/{}"; +/* TASK API */ +const char* OpenMLFile::task_file = "/task/{}"; +/* SPLIT API */ +const char* OpenMLFile::get_split = ""; + +const std::unordered_map + OpenMLFile::m_format_options = {{"xml", xml_server}, + {"json", json_server}, + {"split", splits_server}, + {"download", download_server}}; +const std::unordered_map + OpenMLFile::m_request_options = { + {"dataset_description", dataset_description}, + {"list_data_qualities", list_data_qualities}, + {"data_features", data_features}, + {"data_qualities", data_qualities}, + {"list_dataset_qualities", list_dataset_qualities}, + {"list_dataset_filter", list_dataset_filter}, + {"flow_file", flow_file}, + 
{"flow_exists", flow_exists}, + {"task_file", task_file}}; + +void OpenMLFile::openml_curl_request_helper(const std::string& url) +{ +#ifdef HAVE_CURL + auto curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.\n") + } + + curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl_handle, CURLOPT_HTTPGET, 1); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); + + CURLcode res = curl_easy_perform(curl_handle); + + if (res != CURLE_OK) + SG_SERROR("Connection error: %s.\n", curl_easy_strerror(res)) + + curl_easy_cleanup(curl_handle); +#endif // HAVE_CURL +} + +std::string OpenMLFile::encode_string(const std::string& s) +{ +#ifdef HAVE_CURL + auto curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.\n") + } + + char* encoded_url = curl_easy_escape(curl_handle, s.c_str(), s.size()); + if (!encoded_url) + SG_SERROR("Failed to encode \"%s\" URL escaped.\n", s.c_str()) + return encoded_url; +#else + return s; +#endif +} diff --git a/src/shogun/io/openml/OpenMLReader.h b/src/shogun/io/openml/OpenMLFile.h similarity index 91% rename from src/shogun/io/openml/OpenMLReader.h rename to src/shogun/io/openml/OpenMLFile.h index 8078b1e78ec..ef1796b09ff 100644 --- a/src/shogun/io/openml/OpenMLReader.h +++ b/src/shogun/io/openml/OpenMLFile.h @@ -20,11 +20,11 @@ namespace shogun /** * Reads OpenML streams which can be downloaded with this function. 
*/ - class OpenMLReader + class OpenMLFile { public: - explicit OpenMLReader(const std::string& api_key) : m_api_key(api_key) + explicit OpenMLFile(const std::string& api_key) : m_api_key(api_key) { } @@ -93,7 +93,7 @@ namespace shogun std::string args_string = std::accumulate( args_vec.begin() + 1, args_vec.end(), args_vec.front(), [](std::string s0, std::string& s1) { - return s0 += "/" + s1; + return s0 += "/" + encode_string(s1); }); request_path += args_string; } @@ -113,11 +113,14 @@ namespace shogun return m_curl_response_buffer; #else SG_SERROR( - "This function is only available with the CURL library!\n") + "Please compile shogun with libcurl to query the OpenML server!\n") #endif // HAVE_CURL } private: + + static std::string encode_string(const std::string& s); + /** the raw buffer as a C++ string */ std::string m_curl_response_buffer; @@ -158,6 +161,7 @@ namespace shogun /* FLOW API */ static const char* flow_file; + static const char* flow_exists; /* TASK API */ static const char* task_file; @@ -174,6 +178,9 @@ namespace shogun public: OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; + template + bool post(const std::string& request, const std::string& format, const std::string& message, Args... args); + private: /** the user API key, likely to be needed to write to OpenML */ std::string m_api_key; diff --git a/src/shogun/io/openml/OpenMLFlow.cpp b/src/shogun/io/openml/OpenMLFlow.cpp index dd2f3f839a6..e36ecd251e3 100644 --- a/src/shogun/io/openml/OpenMLFlow.cpp +++ b/src/shogun/io/openml/OpenMLFlow.cpp @@ -4,9 +4,11 @@ * Authors: Gil Hoben */ -#include +#include +#include -#include +#include +#include #include #include @@ -15,50 +17,130 @@ using namespace shogun; using namespace shogun::openml_detail; using namespace rapidjson; +/** + * + */ +class ShogunToString : public AnyVisitor +{ +public: + explicit ShogunToString(){SG_SDEBUG("Debugging ShogunToString\n")}; + + void on(bool* v) final + { + m_string_val = (*v ? 
"true" : "false"); + } + void on(int32_t* v) final + { + m_string_val = std::to_string(*v); + } + void on(int64_t* v) final + { + m_string_val = std::to_string(*v); + } + void on(float* v) final + { + m_string_val = std::to_string(*v); + } + void on(double* v) final + { + m_string_val = std::to_string(*v); + } + void on(long double* v) + { + m_string_val = std::to_string(*v); + } + void on(CSGObject** v) final + { + m_string_val = ""; + } + void on(SGVector* v) final + { + m_string_val = v->to_string(); + } + void on(SGVector* v) final + { + m_string_val = v->to_string(); + } + void on(SGVector* v) final + { + m_string_val = v->to_string(); + } + void on(SGMatrix* mat) final + { + m_string_val = mat->to_string(); + } + void on(SGMatrix* mat) final + { + m_string_val = mat->to_string(); + } + void on(SGMatrix* mat) final + { + m_string_val = mat->to_string(); + } + + SG_FORCED_INLINE std::string get_string_value() const noexcept + { + return m_string_val; + } + +private: + std::string m_string_val; +}; std::shared_ptr OpenMLFlow::download_flow( - const std::string& flow_id, const std::string& api_key) + const std::string& flow_id, const std::string& api_key) { - Document document; parameters_type params; components_type components; - std::string name; - std::string description; - std::string class_name; // get flow and parse with RapidJSON - auto reader = OpenMLReader(api_key); + auto reader = OpenMLFile(api_key); auto return_string = reader.get("flow_file", "json", flow_id); - document.Parse(return_string.c_str()); - check_response(document, "flow"); - // store root for convenience. We know it exists from previous check. 
- const Value& root = document["flow"]; + auto& root = check_response(return_string, "flow"); + + std::string name = + return_if_possible("name", root.GetObject()); + std::string description = + return_if_possible("description", root.GetObject()); + std::string class_name = + return_if_possible("class_name", root.GetObject()); + std::string external_version = + return_if_possible("external_version", root.GetObject()); + + REQUIRE( + root["id"].GetString() == flow_id, + "The flow id returned by the server does not match the id provided. " + "Got %s instead of %s.\n", + root["id"].GetString(), flow_id.c_str()) // handle parameters if (root.HasMember("parameter")) { std::unordered_map param_dict; + OpenMLFlowParameter params_i{}; if (root["parameter"].IsArray()) { for (const auto& v : root["parameter"].GetArray()) { - emplace_string_to_map(v, param_dict, "data_type"); - emplace_string_to_map(v, param_dict, "default_value"); - emplace_string_to_map(v, param_dict, "description"); - params.emplace(v["name"].GetString(), param_dict); - param_dict.clear(); + add_string_to_struct(v, "name", params_i.name); + add_string_to_struct(v, "data_type", params_i.data_type); + add_string_to_struct( + v, "default_value", params_i.default_value); + add_string_to_struct(v, "description", params_i.description); + params.emplace(params_i.name, params_i); } } else { // parameter can also be a dict, instead of array - const auto v = root["parameter"].GetObject(); - emplace_string_to_map(v, param_dict, "data_type"); - emplace_string_to_map(v, param_dict, "default_value"); - emplace_string_to_map(v, param_dict, "description"); - params.emplace(v["name"].GetString(), param_dict); + const auto& v = root["parameter"].GetObject(); + add_string_to_struct(v, "name", params_i.name); + add_string_to_struct(v, "data_type", params_i.data_type); + add_string_to_struct(v, "default_value", params_i.default_value); + add_string_to_struct(v, "description", params_i.description); + 
params.emplace(params_i.name, params_i); } } @@ -70,30 +152,23 @@ std::shared_ptr OpenMLFlow::download_flow( for (const auto& v : root["component"].GetArray()) { components.emplace( - v["identifier"].GetString(), - OpenMLFlow::download_flow( - v["flow"]["id"].GetString(), api_key)); + must_return("identifier", v), + OpenMLFlow::download_flow( + v["flow"]["id"].GetString(), api_key)); } } else { components.emplace( - root["component"]["identifier"].GetString(), - OpenMLFlow::download_flow( - root["component"]["flow"]["id"].GetString(), api_key)); + must_return("identifier", root["component"]), + OpenMLFlow::download_flow( + root["component"]["flow"]["id"].GetString(), api_key)); } } - // get remaining information from flow - if (root.HasMember("name")) - name = root["name"].GetString(); - if (root.HasMember("description")) - description = root["description"].GetString(); - if (root.HasMember("class_name")) - class_name = root["class_name"].GetString(); - auto flow = std::make_shared( - name, description, class_name, components, params); + flow_id, name, description, class_name, external_version, components, + params); return flow; } @@ -112,4 +187,55 @@ std::shared_ptr OpenMLFlow::from_file() { SG_SNOTIMPLEMENTED; return std::shared_ptr(); -} \ No newline at end of file +} + +bool OpenMLFlow::exists_on_server() +{ + // check if flow with same name and version exists + auto reader = std::make_unique(""); + auto flow_exists_string = + reader->get("flow_exists", "json", m_name, m_external_version); + + auto& root = + check_response(flow_exists_string, "flow_exists"); + + return strcmp(root["exists"].GetString(), "true") == 0; +} + +std::vector> +OpenMLFlow::obtain_parameter_values(const std::shared_ptr& model) +{ + std::vector> result; + auto obj_param = model->get_params(); + auto visitor = std::make_unique(); + + result.reserve(m_parameters.size()); + + for (const auto& param : m_parameters) + { + Any any_val = obj_param.at(param.first)->get_value(); + 
any_val.visit(visitor.get()); + // nested objects are handled below + if (!visitor->get_string_value().empty()) + { + // result.emplace_back to call OpenMLParameterValues constructor + // doesn't work here, so create a temporary value with make_shared + // and then push_back + auto val = std::make_shared( + param.first, m_flow_id, visitor->get_string_value()); + result.push_back(val); + } + } + + for (const auto& components : m_components) + { + // TODO: remove std::shared_ptr when smart pointers available + auto obj = std::shared_ptr(model->get(components.first)); + auto val = std::make_shared( + components.first, m_flow_id, + components.second->obtain_parameter_values(obj)); + result.push_back(val); + } + + return result; +} diff --git a/src/shogun/io/openml/OpenMLFlow.h b/src/shogun/io/openml/OpenMLFlow.h index e8424e58343..7ff9f4ff5e0 100644 --- a/src/shogun/io/openml/OpenMLFlow.h +++ b/src/shogun/io/openml/OpenMLFlow.h @@ -7,14 +7,53 @@ #ifndef SHOGUN_OPENMLFLOW_H #define SHOGUN_OPENMLFLOW_H -#include +#include +#include #include #include - namespace shogun { + /** + * OpenML representation of a parameter + */ + struct OpenMLParameterValues + { + /** name of parameter */ + std::string name; + /** id of flow parameter belongs to */ + std::string component; + /** value of parameter or a nested object */ + shogun::variant< + std::string, std::vector>> + value; + + OpenMLParameterValues( + const std::string& name_, const std::string& component_, + shogun::variant< + std::string, + std::vector>> + value_) + : name(name_), component(component_), value(std::move(value_)) + { + } + }; + + /** + * OpenML description of a flow parameter + */ + struct OpenMLFlowParameter + { + /** name of parameter */ + std::string name; + /** primitive type of parameter */ + std::string data_type; + /** the default value, i.e. the value published to OpenML */ + std::string default_value; + /** the parameter description */ + std::string description; + }; /** * Handles OpenML flows. 
A flow contains the information * required to instantiate a model. @@ -28,8 +67,7 @@ namespace shogun std::unordered_map>; /** alias for parameter type, map of maps with information specific to a * parameter */ - using parameters_type = std::unordered_map< - std::string, std::unordered_map>; + using parameters_type = std::unordered_map; /** * The OpenMLFlow constructor. This constructor is rarely used by the @@ -45,10 +83,12 @@ namespace shogun * for each parameter name */ OpenMLFlow( - const std::string& name, const std::string& description, - const std::string& model, components_type components, + const std::string& flow_id, const std::string& name, + const std::string& description, const std::string& model, + const std::string& external_version, components_type components, parameters_type parameters) - : m_name(name), m_description(description), m_class_name(model), + : m_flow_id(flow_id), m_name(name), m_description(description), + m_class_name(model), m_external_version(external_version), m_parameters(std::move(parameters)), m_components(std::move(components)) { @@ -85,6 +125,11 @@ namespace shogun */ void dump() const; + bool exists_on_server(); + + std::vector> + obtain_parameter_values(const std::shared_ptr& model); + /** * Gets a subflow, i.e. 
a kernel in a machine * @param name the name of the subflow, not the flow ID @@ -102,40 +147,60 @@ namespace shogun } #ifndef SWIG - SG_FORCED_INLINE parameters_type - - get_parameters() const noexcept + SG_FORCED_INLINE parameters_type get_parameters() const noexcept { return m_parameters; } - SG_FORCED_INLINE components_type - - get_components() const noexcept + SG_FORCED_INLINE components_type get_components() const noexcept { return m_components; } - SG_FORCED_INLINE std::string - - get_class_name() const noexcept + SG_FORCED_INLINE std::string get_class_name() const noexcept { return m_class_name; } + SG_FORCED_INLINE std::string get_version() const noexcept + { + return m_external_version; + } + + SG_FORCED_INLINE std::string get_name() const noexcept + { + return m_name; + } + + SG_FORCED_INLINE std::string get_flow_id() const noexcept + { + return m_flow_id; + } + + SG_FORCED_INLINE void set_model(std::shared_ptr model) noexcept + { + m_model = model; + } + #endif // SWIG private: + /** flow unique id */ + std::string m_flow_id; /** name field of the flow */ std::string m_name; /** description field of the flow */ std::string m_description; /** the class_name field of the flow */ std::string m_class_name; + /** the external version, e.g. "shogun=7.0.0" */ + std::string m_external_version; /** the parameter field of the flow (optional) */ parameters_type m_parameters; /** the components fields of the flow (optional) */ components_type m_components; + /** the associated model (optional) */ + std::shared_ptr m_model; }; } // namespace shogun diff --git a/src/shogun/io/openml/OpenMLReader.cpp b/src/shogun/io/openml/OpenMLReader.cpp deleted file mode 100644 index 3751e686d19..00000000000 --- a/src/shogun/io/openml/OpenMLReader.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). 
- * - * Authors: Gil Hoben - */ - -#include - -#ifdef HAVE_CURL -#include -#endif // HAVE_CURL - -using namespace shogun; - -/** - * The writer callback function used to write the packets to a C++ string. - * @param data the data received in CURL request - * @param size always 1 - * @param nmemb the size of data - * @param buffer_in the buffer to write to - * @return the size of buffer that was written - */ -size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) -{ - // check that the buffer string points to something - if (buffer_in != nullptr) - { - // Append the data to the buffer - buffer_in->append(data, size * nmemb); - - return size * nmemb; - } - return 0; -} - -/* OpenML server format */ -const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; -const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; -const char* OpenMLReader::download_server = ""; -const char* OpenMLReader::splits_server = "https://www.openml.org/api_splits"; - -/* DATA API */ -const char* OpenMLReader::dataset_description = "/data/{}"; -const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; -const char* OpenMLReader::data_features = "/data/features/{}"; -const char* OpenMLReader::data_qualities = "/data/qualities/{}"; -const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; -const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; -/* FLOW API */ -const char* OpenMLReader::flow_file = "/flow/{}"; -/* TASK API */ -const char* OpenMLReader::task_file = "/task/{}"; -/* SPLIT API */ -const char* OpenMLReader::get_split = ""; - -const std::unordered_map - OpenMLReader::m_format_options = {{"xml", xml_server}, - {"json", json_server}, - {"split", splits_server}, - {"download", download_server}}; -const std::unordered_map - OpenMLReader::m_request_options = { - {"dataset_description", dataset_description}, - {"list_data_qualities", list_data_qualities}, - {"data_features", data_features}, - 
{"data_qualities", data_qualities}, - {"list_dataset_qualities", list_dataset_qualities}, - {"list_dataset_filter", list_dataset_filter}, - {"flow_file", flow_file}, - {"task_file", task_file}}; - -void OpenMLReader::openml_curl_request_helper(const std::string& url) -{ -#ifdef HAVE_CURL - CURL* curl_handle = nullptr; - - curl_handle = curl_easy_init(); - - if (!curl_handle) - { - SG_SERROR("Failed to initialise curl handle.\n") - return; - } - - curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl_handle, CURLOPT_HTTPGET, 1); - curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); - curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); - - CURLcode res = curl_easy_perform(curl_handle); - - if (res != CURLE_OK) - SG_SERROR("Connection error: %s.\n", curl_easy_strerror(res)) - - curl_easy_cleanup(curl_handle); -#endif // HAVE_CURL -} diff --git a/src/shogun/io/openml/OpenMLRun.cpp b/src/shogun/io/openml/OpenMLRun.cpp index 6c6420f54b3..465c5045c21 100644 --- a/src/shogun/io/openml/OpenMLRun.cpp +++ b/src/shogun/io/openml/OpenMLRun.cpp @@ -5,12 +5,16 @@ */ #include +#include #include #include +#include #include #include using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; std::shared_ptr OpenMLRun::run_model_on_task( std::shared_ptr model, std::shared_ptr task) @@ -20,149 +24,87 @@ std::shared_ptr OpenMLRun::run_model_on_task( } std::shared_ptr OpenMLRun::run_flow_on_task( - std::shared_ptr flow, std::shared_ptr task) + std::shared_ptr flow, std::shared_ptr task, + bool avoid_duplicate_runs) { + if (avoid_duplicate_runs && flow->exists_on_server()) + { + auto flow_from_server = + OpenMLFlow::download_flow(flow->get_flow_id(), ""); + } + auto data = task->get_dataset(); - std::shared_ptr features = nullptr; - std::shared_ptr labels = nullptr; - auto model = ShogunOpenML::flow_to_model(std::move(flow), true); + auto model = ShogunOpenML::flow_to_model(flow, true); + 
flow->set_model(model); - labels = data->get_labels(); - features = data->get_features(data->get_default_target_attribute()); + auto labels = data->get_labels(); + auto features = data->get_features(data->get_default_target_attribute()); + + auto machine = std::dynamic_pointer_cast(model); + if (!machine) + { + SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") + } - auto storage = std::make_shared(); + auto* xval_storage = new CrossValidationStorage(); if (task->get_split()->contains_splits()) { - auto machine = std::dynamic_pointer_cast(model); - if (!machine) - { - SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") - } auto train_idx = task->get_train_indices(); auto test_idx = task->get_test_indices(); - auto xval_storage = std::make_shared(); - xval_storage->set_num_folds(task->get_num_fold()); xval_storage->set_num_runs(task->get_num_repeats()); + xval_storage->set_num_folds(task->get_num_fold()); + machine->set_store_model_features(true); - // copied/adapted from crossvalidation for (auto repeat_idx : range(task->get_num_repeats())) { for (auto fold_idx : range(task->get_num_fold())) { - auto* fold = new CrossValidationFoldStorage(); - SG_REF(fold) - - auto cloned_machine = (CMachine*)machine->clone(); - - // TODO while these are not used through const interfaces, - // we unfortunately have to clone, even though these could be - // shared - auto features_clone = (CFeatures*)features->clone(); - auto labels_clone = (CLabels*)labels->clone(); - // auto evaluation_criterion = - // (CEvaluation*)m_evaluation_criterion->clone(); - - /* evtl. 
update xvalidation output class */ - fold->set_run_index(repeat_idx); - fold->set_fold_index(fold_idx); - - auto train_fold_idx = SGVector( + SGVector train_i_idx( train_idx[repeat_idx][fold_idx].data(), - train_idx[repeat_idx][fold_idx].size(), false); - - features_clone->add_subset(train_fold_idx); - - /* set label subset for training */ - labels_clone->add_subset(train_fold_idx); - - SG_SDEBUG( - "train set repeat %d fold %d: %s\n", repeat_idx, fold_idx, - train_fold_idx.to_string().c_str()) - - /* train machine on training features and remove subset */ - SG_SDEBUG("starting training\n") - cloned_machine->set_labels(labels_clone); - cloned_machine->train(features_clone); - SG_SDEBUG("finished training\n") - - /* evtl. update xvalidation output class */ - fold->set_train_indices(train_fold_idx); - auto fold_machine = (CMachine*)cloned_machine->clone(); - fold->set_trained_machine(fold_machine); - SG_UNREF(fold_machine) - - features_clone->remove_subset(); - labels_clone->remove_subset(); - - /* set feature subset for testing (subset method that stores - * pointer) */ - auto test_fold_idx = SGVector( - test_idx[repeat_idx][fold_idx].data(), - test_idx[repeat_idx][fold_idx].size(), false); - features_clone->add_subset(test_fold_idx); - - /* set label subset for testing */ - labels_clone->add_subset(test_fold_idx); - - SG_SDEBUG( - "test set repeat %d fold %d: %s\n", repeat_idx, fold_idx, - test_fold_idx.to_string().c_str()) - - /* apply machine to test features and remove subset */ - SG_SDEBUG("starting evaluation\n") - SG_SDEBUG("%p\n", features_clone) - CLabels* result_labels = cloned_machine->apply(features_clone); - SG_SDEBUG("finished evaluation\n") - features_clone->remove_subset(); - SG_REF(result_labels); - - /* evaluate */ - // results[i] = - // evaluation_criterion->evaluate(result_labels, - //labels); SG_DEBUG("result on fold %d is %f\n", i, results[i]) - - /* evtl. 
update xvalidation output class */ - // fold->set_test_indices(test_fold_idx); - // fold->set_test_result(result_labels); - // auto* true_labels = (CLabels*)labels->clone(); - // fold->set_test_true_result(true_labels); - // SG_UNREF(true_labels) - // fold->post_update_results(); - // fold->set_evaluation_result(results[i]); - - storage->append_fold_result(fold); - // - // /* clean up, remove subsets */ - // labels->remove_subset(); - SG_UNREF(cloned_machine); - SG_UNREF(features_clone); - SG_UNREF(labels_clone); - // SG_UNREF(evaluation_criterion); - // SG_UNREF(result_labels); - SG_UNREF(fold) + train_idx[repeat_idx][fold_idx].size()); + SGVector test_i_idx( + train_idx[repeat_idx][fold_idx].data(), + train_idx[repeat_idx][fold_idx].size()); + xval_storage->append_fold_result( + ShogunOpenML::run_model_on_fold( + machine, task, features, labels, train_i_idx, + test_i_idx, repeat_idx, fold_idx) + .release()); } } } else { - // ensures delete is called by shared ptr destructor - SG_REF(labels.get()) - SG_REF(features.get()) - if (auto machine = std::dynamic_pointer_cast(model)) - { - auto result = ShogunOpenML::run_model_on_fold( - machine, task, features, 0, 0, labels, - nullptr); - SG_SDEBUG(result->to_string().c_str()); - } - else - SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") + xval_storage->set_num_runs(0); + xval_storage->set_num_folds(0); + xval_storage->append_fold_result( + ShogunOpenML::run_model_on_fold(machine, task, features, labels) + .release()); } - return std::shared_ptr(); + SG_SDEBUG("End of openml run: %s\n", xval_storage->to_string().c_str()); + + return std::make_shared( + nullptr, // uploader + nullptr, // uploader_name + nullptr, // setup_id + nullptr, // setup_string + nullptr, // parameter_settings + std::vector{}, // evaluations + std::vector{}, // fold_evaluations + std::vector{}, // sample_evaluations + nullptr, // data_content + std::vector{}, // output_files + task, // task + flow, // flow + nullptr, // run_id + 
model, // model + std::vector{}, // tags + nullptr // predictions_url + ); } std::shared_ptr diff --git a/src/shogun/io/openml/OpenMLRun.h b/src/shogun/io/openml/OpenMLRun.h index 9046d09bea9..3619bed62d1 100644 --- a/src/shogun/io/openml/OpenMLRun.h +++ b/src/shogun/io/openml/OpenMLRun.h @@ -46,7 +46,9 @@ namespace shogun { from_filesystem(const std::string& directory); static std::shared_ptr run_flow_on_task( - std::shared_ptr flow, std::shared_ptr task); + std::shared_ptr flow, + std::shared_ptr task, + bool avoid_duplicate_runs=true); static std::shared_ptr run_model_on_task( std::shared_ptr model, std::shared_ptr task); diff --git a/src/shogun/io/openml/OpenMLSplit.cpp b/src/shogun/io/openml/OpenMLSplit.cpp index 19569137e0d..42d14520168 100644 --- a/src/shogun/io/openml/OpenMLSplit.cpp +++ b/src/shogun/io/openml/OpenMLSplit.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include #include using namespace shogun; @@ -17,7 +17,7 @@ using namespace shogun; std::shared_ptr OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) { - auto reader = OpenMLReader(api_key); + auto reader = OpenMLFile(api_key); auto return_string = reader.get(split_url); if (return_string == "Task not providing datasplits.") diff --git a/src/shogun/io/openml/OpenMLTask.cpp b/src/shogun/io/openml/OpenMLTask.cpp index c4a4c1fb00c..f41bd3d0cc7 100644 --- a/src/shogun/io/openml/OpenMLTask.cpp +++ b/src/shogun/io/openml/OpenMLTask.cpp @@ -4,7 +4,7 @@ * Authors: Gil Hoben */ -#include +#include #include #include @@ -12,23 +12,34 @@ using namespace shogun; using namespace shogun::openml_detail; using namespace rapidjson; +struct DatasetStruct +{ + std::string dataset_id; + std::string target_feature; +}; + +struct SplitStruct +{ + std::string split_id; + std::string type; + std::string data_splits_url; + std::unordered_map split_parameters; +}; + std::shared_ptr OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) { - Document document; 
std::string task_name; std::string task_type_id; - std::shared_ptr openml_dataset = nullptr; - std::shared_ptr openml_split = nullptr; std::unordered_map evaluation_measures; - auto reader = OpenMLReader(api_key); - auto return_string = reader.get("task_file", "json", task_id); + DatasetStruct dataset_struct{}; + SplitStruct split_struct{}; - document.Parse(return_string.c_str()); - check_response(document, "task"); + auto reader = OpenMLFile(api_key); + auto return_string = reader.get("task_file", "json", task_id); - const Value& root = document["task"]; + auto& root = check_response(return_string, "task"); REQUIRE( task_id == root["task_id"].GetString(), @@ -42,7 +53,7 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) task_type_id = root["task_type_id"].GetString(); // expect two elements in input array: dataset and split - const Value& json_input = root["input"]; + auto& json_input = root["input"]; auto input_array = json_input.GetArray(); @@ -51,19 +62,21 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) if (strcmp(task_settings["name"].GetString(), "source_data") == 0) { auto dataset_info = task_settings["data_set"].GetObject(); - std::string dataset_id = dataset_info["data_set_id"].GetString(); - std::string target_feature = - dataset_info["target_feature"].GetString(); - openml_dataset = OpenMLData::get_dataset(dataset_id, api_key); + add_string_to_struct( + dataset_info, "data_set_id", dataset_struct.dataset_id); + add_string_to_struct( + dataset_info, "target_feature", dataset_struct.target_feature); } else if ( strcmp(task_settings["name"].GetString(), "estimation_procedure") == 0) { auto split_info = task_settings["estimation_procedure"].GetObject(); - std::string split_id = split_info["id"].GetString(); - std::string split_type = split_info["type"].GetString(); - std::string split_url = split_info["data_splits_url"].GetString(); + add_string_to_struct(split_info, "id", split_struct.split_id); + 
add_string_to_struct(split_info, "type", split_struct.type); + add_string_to_struct( + split_info, "data_splits_url", split_struct.data_splits_url); + std::unordered_map split_parameters; for (const auto& param : split_info["parameter"].GetArray()) { @@ -77,11 +90,12 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) "Unexpected number of parameters in parameter array " "of estimation_procedure.\n") } + split_struct.split_parameters = split_parameters; + REQUIRE( - split_type == "crossvalidation", + split_struct.type == "crossvalidation", "Currently only tasks with cross validation are enabled in " "shogun!\n") - openml_split = OpenMLSplit::get_split(split_url, api_key); } else if ( strcmp(task_settings["name"].GetString(), "evaluation_measures") == @@ -100,8 +114,17 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) } } - if (openml_dataset == nullptr && openml_split == nullptr) - SG_SERROR("Error parsing task.\n") + REQUIRE( + !dataset_struct.dataset_id.empty(), + "The dataset ID is required to retrieve the dataset!\n") + auto openml_dataset = + OpenMLData::get_dataset(dataset_struct.dataset_id, api_key); + + REQUIRE( + !split_struct.data_splits_url.empty(), + "The split URL is required to retrieve the split information!\n") + auto openml_split = + OpenMLSplit::get_split(split_struct.data_splits_url, api_key); auto result = std::make_shared( task_id, task_name, task_type, task_type_id, evaluation_measures, @@ -133,7 +156,7 @@ OpenMLTask::get_test_indices() const std::vector>> OpenMLTask::get_indices(const std::array, 3>& idx) const { - // result = (n_repeats, n_folds, ?) where is the number of indices in a + // result = (n_repeats, n_folds, ?) where ? 
is the number of indices in a // given fold std::vector>> result( m_split->get_num_repeats(), diff --git a/src/shogun/io/openml/OpenMLTask.h b/src/shogun/io/openml/OpenMLTask.h index 9f5e97c1bef..c248ee7d109 100644 --- a/src/shogun/io/openml/OpenMLTask.h +++ b/src/shogun/io/openml/OpenMLTask.h @@ -62,7 +62,8 @@ namespace shogun return m_split; } - std::vector>> get_train_indices() const; + std::vector>> + get_train_indices() const; std::vector>> get_test_indices() const; @@ -76,14 +77,16 @@ namespace shogun return m_split->get_num_repeats(); } -#ifndef SWIG - SG_FORCED_INLINE TaskType + std::string get_task_name() const noexcept + { + return m_task_name; + } - get_task_type() const noexcept +#ifndef SWIG + SG_FORCED_INLINE TaskType get_task_type() const noexcept { return m_task_type; } - #endif // SWIG private: diff --git a/src/shogun/io/openml/ShogunOpenML.cpp b/src/shogun/io/openml/ShogunOpenML.cpp index aff02d90a62..ed73099f1c3 100644 --- a/src/shogun/io/openml/ShogunOpenML.cpp +++ b/src/shogun/io/openml/ShogunOpenML.cpp @@ -1,3 +1,5 @@ +#include + /* * This software is distributed under BSD 3-clause license (see LICENSE file). * @@ -22,12 +24,14 @@ class StringToShogun : public AnyVisitor { public: explicit StringToShogun(std::shared_ptr model) - : m_model(model), m_parameter(""), m_string_val(""){}; + : m_model(std::move(model)), m_parameter(""), + m_string_val(""){SG_SDEBUG("Debugging StringToShogun\n")}; StringToShogun( - std::shared_ptr model, const std::string& parameter, - const std::string& string_val) - : m_model(model), m_parameter(parameter), m_string_val(string_val){}; + std::shared_ptr model, const std::string& parameter, + const std::string& string_val) + : m_model(std::move(model)), m_parameter(parameter), + m_string_val(string_val){SG_SDEBUG("Debugging StringToShogun\n")}; void on(bool* v) final { @@ -53,11 +57,11 @@ class StringToShogun : public AnyVisitor // it's an option, i.e. 
internally represented // as an enum but in swig exposed as a string m_string_val.erase( - std::remove_if( - m_string_val.begin(), m_string_val.end(), - // remove quotes - [](const auto& val) { return val == '\"'; }), - m_string_val.end()); + std::remove_if( + m_string_val.begin(), m_string_val.end(), + // remove quotes + [](const auto& val) { return val == '\"'; }), + m_string_val.end()); m_model->put(m_parameter, m_string_val); } } @@ -93,7 +97,7 @@ class StringToShogun : public AnyVisitor void on(long double* v) { SG_SDEBUG( - "long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + "long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { floatmax_t result = std::stold(m_string_val); @@ -103,38 +107,38 @@ class StringToShogun : public AnyVisitor void on(CSGObject** v) final { SG_SDEBUG( - "CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + "CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGVector* v) final { SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + "SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGVector* v) final { SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } void on(SGVector* v) final { SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } void on(SGMatrix* mat) final { SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGMatrix* mat) final { SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) + "SGMatrix: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } void on(SGMatrix* mat) final{SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())} + "SGMatrix: %s=%s\n", 
m_parameter.c_str(), m_string_val.c_str())} /** * In OpenML "null" is an empty parameter value field. @@ -169,7 +173,7 @@ class StringToShogun : public AnyVisitor * @return the instantiated object using a factory */ std::shared_ptr instantiate_model_from_factory( - const std::string& factory_name, const std::string& algo_name) + const std::string& factory_name, const std::string& algo_name) { if (factory_name == "machine") return std::shared_ptr(machine(algo_name)); @@ -190,9 +194,9 @@ std::shared_ptr instantiate_model_from_factory( * @param parameter_name the name of nested_obj */ void cast_and_put( - const std::shared_ptr& obj, - const std::shared_ptr& nested_obj, - const std::string& parameter_name) + const std::shared_ptr& obj, + const std::shared_ptr& nested_obj, + const std::string& parameter_name) { if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) { @@ -218,7 +222,7 @@ void cast_and_put( } std::shared_ptr ShogunOpenML::flow_to_model( - std::shared_ptr flow, bool initialize_with_defaults) + std::shared_ptr flow, bool initialize_with_defaults) { auto params = flow->get_parameters(); auto components = flow->get_components(); @@ -237,7 +241,7 @@ std::shared_ptr ShogunOpenML::flow_to_model( { Any any_val = obj_param.at(param.first)->get_value(); std::string name = param.first; - std::string val_as_string = param.second.at("default_value"); + std::string val_as_string = param.second.default_value; visitor->set_parameter_name(name); visitor->set_string_value(val_as_string); any_val.visit(visitor.get()); @@ -247,7 +251,7 @@ std::shared_ptr ShogunOpenML::flow_to_model( for (const auto& component : components) { std::shared_ptr nested_obj = - flow_to_model(component.second, initialize_with_defaults); + flow_to_model(component.second, initialize_with_defaults); cast_and_put(obj, nested_obj, component.first); } @@ -280,61 +284,199 @@ ShogunOpenML::get_class_info(const std::string& class_name) class_components.emplace_back(std::string(begin, std::next(it))); 
} - if (class_components[0] == "shogun" && class_components.size() == 3) + if (class_components.empty()) + SG_SERROR( + "Error parsing flow class name \"%s\"!\n", class_name.c_str()); + + if (class_components.size() == 3 && class_components[0] == "shogun") result = std::make_pair(class_components[1], class_components[2]); - else if (class_components[0] == "shogun" && class_components.size() != 3) + else if (class_components.size() != 3 && class_components[0] == "shogun") SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) else SG_SERROR( - "The provided flow is not meant for shogun deserialisation! The " - "required library is \"%s\".\n", - class_components[0].c_str()) + "The provided flow is not meant for shogun deserialisation! The " + "required library is \"%s\".\n", + class_components[0].c_str()) return result; } -std::shared_ptr ShogunOpenML::run_model_on_fold( - const std::shared_ptr& model, - const std::shared_ptr& task, - const std::shared_ptr& X_train, index_t repeat_number, - index_t fold_number, const std::shared_ptr& y_train, - const std::shared_ptr& X_test) +std::unique_ptr ShogunOpenML::run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + const std::shared_ptr& labels, const SGVector& train_idx, + const SGVector& test_idx, index_t repeat_idx, index_t fold_idx) { auto task_type = task->get_task_type(); - auto model_clone = std::shared_ptr(model->clone()); switch (task_type) { - case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: - case OpenMLTask::TaskType::SUPERVISED_REGRESSION: - { - if (auto machine = std::dynamic_pointer_cast(model_clone)) - { - // TODO: refactor! 
more useless clones until smart pointers are merged - machine->put("labels", y_train->clone()->as()); - auto tmp = X_train.get(); - machine->train(tmp); - if (X_test) - return std::shared_ptr(machine->apply(X_test.get())); - else - return std::shared_ptr(machine->apply(X_train.get())); - } - else - SG_SERROR("The provided model is not a trainable machine!\n") - } - break; - case OpenMLTask::TaskType::LEARNING_CURVE: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::CLUSTERING: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: - SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + { + // copied/adapted from crossvalidation + auto fold = std::make_unique(); + auto* cloned_machine = machine->clone()->as(); + + // TODO while these are not used through const interfaces, + // we unfortunately have to clone, even though these could be + // shared + auto* features_clone = features->clone()->as(); + auto* labels_clone = labels->clone()->as(); + // auto* evaluation_criterion = + // (CEvaluation*)m_evaluation_criterion->clone(); + + /* evtl. update xvalidation output class */ + fold->set_run_index(repeat_idx); + fold->set_fold_index(fold_idx); + + /* set feature and label subset for training */ + features_clone->add_subset(train_idx); + labels_clone->add_subset(train_idx); + + SG_SDEBUG( + "train set repeat %d fold %d: %s\n", repeat_idx, fold_idx, + train_idx.to_string().c_str()) + + /* train machine on training features and remove subset */ + SG_SDEBUG("starting training\n") + cloned_machine->set_labels(labels_clone); + cloned_machine->train(features_clone); + SG_SDEBUG("finished training\n") + + /* evtl. 
update xvalidation output class */ + fold->set_train_indices(train_idx); + auto* fold_machine = cloned_machine->clone()->as(); + fold->set_trained_machine(fold_machine); + SG_UNREF(fold_machine) + + features_clone->remove_subset(); + labels_clone->remove_subset(); + + /* set features and label subset for testing */ + features_clone->add_subset(test_idx); + labels_clone->add_subset(test_idx); + + SG_SDEBUG( + "test set repeat %d fold %d: %s\n", repeat_idx, fold_idx, + test_idx.to_string().c_str()) + + /* apply machine to test features and remove subset */ + SG_SDEBUG("starting evaluation\n") + SG_SDEBUG("%p\n", features_clone) + CLabels* result_labels = cloned_machine->apply(features_clone); + SG_SDEBUG("finished evaluation\n") + features_clone->remove_subset(); + SG_REF(result_labels); + + /* evaluate */ + // results[i] = evaluation_criterion->evaluate(result_labels, labels); + // SG_DEBUG("result on fold %d is %f\n", i, results[i]) + + /* evtl. update xvalidation output class */ + fold->set_test_indices(test_idx); + fold->set_test_result(result_labels); + auto* true_labels = (CLabels*)labels->clone(); + fold->set_test_true_result(true_labels); + SG_UNREF(true_labels) + fold->post_update_results(); + // fold->set_evaluation_result(results[i]); + + /* clean up, remove subsets */ + labels->remove_subset(); + SG_UNREF(cloned_machine); + SG_UNREF(features_clone); + SG_UNREF(labels_clone); + // SG_UNREF(evaluation_criterion); + SG_UNREF(result_labels); + return fold; + } + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + return nullptr; +} + +std::unique_ptr 
ShogunOpenML::run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + const std::shared_ptr& labels) +{ + auto task_type = task->get_task_type(); + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + { + auto fold = std::make_unique(); + auto* cloned_machine = machine->clone()->as(); + auto* features_clone = features->clone()->as(); + auto* labels_clone = labels->clone()->as(); + fold->set_run_index(0); + fold->set_fold_index(0); + + /* train machine on training features */ + SG_SDEBUG("starting training\n") + cloned_machine->set_labels(labels_clone); + cloned_machine->train(features_clone); + SG_SDEBUG("finished training\n") + + auto* fold_machine = cloned_machine->clone()->as(); + fold->set_trained_machine(fold_machine); + SG_UNREF(fold_machine) + + /* apply machine to test features */ + SG_SDEBUG("starting evaluation\n") + SG_SDEBUG("%p\n", features_clone) + CLabels* result_labels = cloned_machine->apply(features_clone); + SG_SDEBUG("finished evaluation\n") + + /* evaluate */ + // results[i] = evaluation_criterion->evaluate(result_labels, labels); + // SG_DEBUG("result on fold %d is %f\n", i, results[i]) + + /* evtl. 
update xvalidation output class */ + fold->set_test_result(result_labels); + auto* true_labels = (CLabels*)labels->clone(); + fold->set_test_true_result(true_labels); + SG_UNREF(true_labels) + fold->post_update_results(); + // fold->set_evaluation_result(results[i]); + + // cleanup + SG_UNREF(cloned_machine); + SG_UNREF(features_clone); + SG_UNREF(labels_clone); + // SG_UNREF(evaluation_criterion); + SG_UNREF(result_labels); + return fold; + } + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED } return nullptr; } \ No newline at end of file diff --git a/src/shogun/io/openml/ShogunOpenML.h b/src/shogun/io/openml/ShogunOpenML.h index ff5257f19b8..88f0e19a842 100644 --- a/src/shogun/io/openml/ShogunOpenML.h +++ b/src/shogun/io/openml/ShogunOpenML.h @@ -8,6 +8,7 @@ #define SHOGUN_SHOGUNOPENML_H #include +#include #include #include @@ -44,12 +45,21 @@ namespace shogun model_to_flow(const std::shared_ptr& model); protected: - static std::shared_ptr run_model_on_fold( - const std::shared_ptr& model, - const std::shared_ptr& task, - const std::shared_ptr& X_train, index_t repeat_number, - index_t fold_number, const std::shared_ptr& y_train, - const std::shared_ptr& X_test); + static std::unique_ptr run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + const std::shared_ptr& labels, + const SGVector& train_idx, + const SGVector& test_id, + index_t repeat_number, + index_t fold_number); + + static std::unique_ptr run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + 
const std::shared_ptr& labels); private: /** diff --git a/src/shogun/io/openml/utils.h b/src/shogun/io/openml/utils.h index 122ce170098..b9f0d0ec1bf 100644 --- a/src/shogun/io/openml/utils.h +++ b/src/shogun/io/openml/utils.h @@ -15,65 +15,97 @@ namespace shogun { namespace openml_detail { + enum class BACKEND_FORMAT + { + JSON = 0, + XML = 1, + }; + /** * Checks the returned response from OpenML in JSON format * @param doc the parsed OpenML JSON format response */ - static void - check_response(const rapidjson::Document& doc, const std::string& type) + template < + BACKEND_FORMAT FormatT, + typename std::enable_if_t* = + nullptr> + const rapidjson::Value& + check_response(const std::string& val, const std::string& root_name) { + rapidjson::Document doc; + doc.Parse(val.c_str()); + if (SG_UNLIKELY(doc.HasMember("error"))) { const rapidjson::Value& root = doc["error"]; SG_SERROR( "Server error %s: %s\n", root["code"].GetString(), root["message"].GetString()) - return; } REQUIRE( - doc.HasMember(type.c_str()), - "Unexpected format of OpenML %s.\n", type.c_str()); + doc.HasMember(root_name.c_str()), + "Unexpected format of OpenML %s.\n", root_name.c_str()); + + return doc[root_name.c_str()]; } /** - * Helper function to add JSON objects as string in map - * @param v a RapidJSON GenericValue, i.e. 
string - * @param param_dict the map to write to - * @param name the name of the key + * Checks the returned response from OpenML in XML format + * @param doc the parsed OpenML XML format response */ - static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericValue>& v, - std::unordered_map& param_dict, - const std::string& name, bool required = false) + template < + BACKEND_FORMAT FormatT, + typename std::enable_if_t* = + nullptr> + void check_response(const std::string& val, const std::string& type) { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else if (required) - SG_SERROR( - "The field \"%s\" is expected to be a string!\n", - name.c_str()) - else - param_dict.emplace(name, ""); + SG_SNOTIMPLEMENTED } - /** - * Helper function to add JSON objects as string in map - * @param v a RapidJSON GenericObject, i.e. array - * @param param_dict the map to write to - * @param name the name of the key - */ - static SG_FORCED_INLINE void emplace_string_to_map( + template + static SG_FORCED_INLINE void add_string_to_struct( const rapidjson::GenericObject< true, rapidjson::GenericValue>>& v, - std::unordered_map& param_dict, - const std::string& name) + const std::string& name, T& custom_struct) { if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else - param_dict.emplace(name, ""); + custom_struct = v[name.c_str()].GetString(); } + template + static SG_FORCED_INLINE void add_string_to_struct( + const rapidjson::GenericValue>& v, + const std::string& name, T& custom_struct) + { + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + custom_struct = v[name.c_str()].GetString(); + } + + template + SG_FORCED_INLINE T must_return( + const std::string& name, + const rapidjson::GenericValue>& v) + { + SG_SNOTIMPLEMENTED + } + + template <> + SG_FORCED_INLINE std::string must_return( + const 
std::string& name, + const rapidjson::GenericValue>& v) + { + if (v.HasMember(name.c_str()) && v[name.c_str()].IsString()) + return v[name.c_str()].GetString(); + if (v.HasMember(name.c_str()) && !v[name.c_str()].IsString()) + SG_SERROR( + "Found member \"%s\" but it is not a string", name.c_str()) + if (!v.HasMember(name.c_str())) + SG_SERROR( + "\"%s\" is not a member of the given object", name.c_str()) + return nullptr; + } + + template SG_FORCED_INLINE T return_if_possible( const std::string& name, @@ -96,8 +128,6 @@ namespace shogun "Found member \"%s\" but it is not a string", name.c_str()) if (!v.HasMember(name.c_str())) return ""; - SG_SERROR( - "\"%s\" is not a member of the given object", name.c_str()) return nullptr; } From b045a8a387e00d71a6215a272f36a8d09a3912a7 Mon Sep 17 00:00:00 2001 From: gf712 Date: Fri, 24 May 2019 14:32:13 +0100 Subject: [PATCH 15/32] fixes for new arff api --- src/shogun/io/openml/OpenMLData.cpp | 66 ++++++++++++++++------------ src/shogun/io/openml/OpenMLData.h | 17 +++---- src/shogun/io/openml/OpenMLRun.cpp | 16 +++---- src/shogun/io/openml/OpenMLSplit.cpp | 21 +++------ 4 files changed, 58 insertions(+), 62 deletions(-) diff --git a/src/shogun/io/openml/OpenMLData.cpp b/src/shogun/io/openml/OpenMLData.cpp index 728cb6b1484..5f88fcbfb24 100644 --- a/src/shogun/io/openml/OpenMLData.cpp +++ b/src/shogun/io/openml/OpenMLData.cpp @@ -23,7 +23,8 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) auto reader = OpenMLFile(api_key); auto return_string = reader.get("dataset_description", "json", id); - auto& dataset_description = check_response(return_string, "data_set_description"); + auto& dataset_description = check_response( + return_string, "data_set_description"); auto name = return_if_possible( "name", dataset_description.GetObject()); @@ -77,7 +78,8 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) param_vector; return_string = reader.get("data_features", "json", id); - 
auto& dataset_features = check_response(return_string, "data_features"); + auto& dataset_features = + check_response(return_string, "data_features"); for (const auto& param : dataset_features["feature"].GetArray()) { @@ -100,7 +102,8 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) std::vector> qualities_vector; return_string = reader.get("data_qualities", "json", id); - auto& data_qualities = check_response(return_string, "data_qualities"); + auto& data_qualities = + check_response(return_string, "data_qualities"); for (const auto& param : data_qualities["quality"].GetArray()) { @@ -129,14 +132,16 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) std::shared_ptr OpenMLData::get_features() noexcept { - if (!m_cached_features) - get_data(); - return m_cached_features; + // if (!m_cached_features) + // get_data(); + // return m_cached_features; + SG_SNOTIMPLEMENTED + return nullptr; } std::shared_ptr OpenMLData::get_features(const std::string& label) { - if (!m_cached_features) + if (m_cached_features.empty()) get_data(); auto find_label = std::find(m_feature_names.begin(), m_feature_names.end(), label); @@ -147,39 +152,42 @@ std::shared_ptr OpenMLData::get_features(const std::string& label) feat_type_copy.erase(feat_type_copy.begin() + col_idx); for (const auto type : feat_type_copy) { - if (type == ARFFDeserializer::Attribute::STRING) + if (type == Attribute::STRING) SG_SNOTIMPLEMENTED } - // auto result = std::make_shared(); + std::shared_ptr> result; bool first = true; + size_t n_examples = 0; for (int i = 0; i < m_feature_types.size(); ++i) { if (i != col_idx && first) { - result.reset(m_cached_features->get_feature_obj(i) - ->as>()); + result.reset(m_cached_features[0]->as>()); + n_examples = result->get_num_vectors(); first = false; } if (i != col_idx) - result.reset( - result - ->create_merged_copy(m_cached_features->get_feature_obj(i)) - ->as>()); + { + REQUIRE( + n_examples == 
m_cached_features[i]->get_num_vectors(), + "Expected all features to have the same number of examples!\n") + result.reset(result->create_merged_copy(m_cached_features[i].get()) + ->as>()); + } } + REQUIRE(n_examples != 0, "No features extracted!\n") + // need to copy data as result is only in the stack and the data // will be gone at the end of the function - auto* copy_feat = SG_MALLOC( - float64_t, - m_feature_types.size() * m_cached_features->get_num_vectors()); + auto* copy_feat = SG_MALLOC(float64_t, m_feature_types.size() * n_examples); memcpy( copy_feat, result->get_feature_matrix().data(), - m_feature_types.size() * m_cached_features->get_num_vectors()); + m_feature_types.size() * m_cached_features.size()); result = std::make_shared>( - copy_feat, m_feature_types.size(), - m_cached_features->get_num_vectors()); + copy_feat, m_feature_types.size(), n_examples); return result; } @@ -197,7 +205,7 @@ std::shared_ptr OpenMLData::get_labels(const std::string& label_name) if (m_cached_labels && label_name == m_cached_label_name) return m_cached_labels; - if (!m_cached_features) + if (m_cached_features.empty()) get_data(); auto find_label = @@ -207,16 +215,16 @@ std::shared_ptr OpenMLData::get_labels(const std::string& label_name) "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) auto col_idx = std::distance(m_feature_names.begin(), find_label); - auto target_label_as_feat = - std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); + std::shared_ptr target_label_as_feat = + m_cached_features[col_idx]; switch (m_feature_types[col_idx]) { // real features - case ARFFDeserializer::Attribute::REAL: - case ARFFDeserializer::Attribute::NUMERIC: - case ARFFDeserializer::Attribute::INTEGER: - case ARFFDeserializer::Attribute::DATE: + case Attribute::REAL: + case Attribute::NUMERIC: + case Attribute::INTEGER: + case Attribute::DATE: { auto casted_feat = std::dynamic_pointer_cast>( target_label_as_feat); @@ -228,7 +236,7 @@ std::shared_ptr 
OpenMLData::get_labels(const std::string& label_name) } break; // nominal features - case ARFFDeserializer::Attribute::NOMINAL: + case Attribute::NOMINAL: { auto casted_feat = std::dynamic_pointer_cast>( target_label_as_feat); diff --git a/src/shogun/io/openml/OpenMLData.h b/src/shogun/io/openml/OpenMLData.h index 25bf54f3084..022eb06ad8d 100644 --- a/src/shogun/io/openml/OpenMLData.h +++ b/src/shogun/io/openml/OpenMLData.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include namespace shogun { @@ -90,7 +90,6 @@ namespace shogun */ std::shared_ptr get_labels(); - /** * Returns the dataset labels given the label_name * @return the labels @@ -101,16 +100,14 @@ namespace shogun * Returns the type of all attributes/features in the ARFF file * @return */ - SG_FORCED_INLINE std::vector - - get_feature_types() const noexcept + SG_FORCED_INLINE std::vector get_feature_types() const + noexcept { return m_feature_types; } - SG_FORCED_INLINE std::string - - get_default_target_attribute() const noexcept + SG_FORCED_INLINE std::string get_default_target_attribute() const + noexcept { return m_default_target_attribute; } @@ -153,9 +150,9 @@ namespace shogun m_param_qualities; std::string m_api_key; - std::shared_ptr m_cached_features; + std::vector> m_cached_features; std::vector m_feature_names; - std::vector m_feature_types; + std::vector m_feature_types; std::shared_ptr m_cached_labels; std::string m_cached_label_name; }; diff --git a/src/shogun/io/openml/OpenMLRun.cpp b/src/shogun/io/openml/OpenMLRun.cpp index 465c5045c21..da28c07994d 100644 --- a/src/shogun/io/openml/OpenMLRun.cpp +++ b/src/shogun/io/openml/OpenMLRun.cpp @@ -88,22 +88,22 @@ std::shared_ptr OpenMLRun::run_flow_on_task( SG_SDEBUG("End of openml run: %s\n", xval_storage->to_string().c_str()); return std::make_shared( - nullptr, // uploader - nullptr, // uploader_name - nullptr, // setup_id - nullptr, // setup_string - nullptr, // parameter_settings + std::string{}, // uploader + std::string{}, // 
uploader_name + std::string{}, // setup_id + std::string{}, // setup_string + std::string{}, // parameter_settings std::vector{}, // evaluations std::vector{}, // fold_evaluations std::vector{}, // sample_evaluations - nullptr, // data_content + std::string{}, // data_content std::vector{}, // output_files task, // task flow, // flow - nullptr, // run_id + std::string{}, // run_id model, // model std::vector{}, // tags - nullptr // predictions_url + std::string{} // predictions_url ); } diff --git a/src/shogun/io/openml/OpenMLSplit.cpp b/src/shogun/io/openml/OpenMLSplit.cpp index 42d14520168..04d6206ed5f 100644 --- a/src/shogun/io/openml/OpenMLSplit.cpp +++ b/src/shogun/io/openml/OpenMLSplit.cpp @@ -29,27 +29,18 @@ OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) arff_parser.read(); auto arff_features = arff_parser.get_features(); REQUIRE( - arff_features->get_num_feature_obj() == 4, + arff_features.size() == 4, "Expected a ARFF file with 4 attributes: type, rowid, repeat and " "fold.\n") - auto train_test_feat = - std::shared_ptr(arff_features->get_feature_obj(0)); - auto rowid_feat = - std::shared_ptr(arff_features->get_feature_obj(1)); - auto repeat_feat = - std::shared_ptr(arff_features->get_feature_obj(2)); - auto fold_feat = - std::shared_ptr(arff_features->get_feature_obj(3)); - - auto type_vector = nominal_feature_to_vector(train_test_feat); - auto rowid_vector = dense_feature_to_vector(rowid_feat); - auto repeat_vector = dense_feature_to_vector(repeat_feat); - auto fold_vector = dense_feature_to_vector(fold_feat); + auto type_vector = nominal_feature_to_vector(arff_features[0]); + auto rowid_vector = dense_feature_to_vector(arff_features[1]); + auto repeat_vector = dense_feature_to_vector(arff_features[2]); + auto fold_vector = dense_feature_to_vector(arff_features[3]); std::array, 3> train_idx, test_idx; - for (int i = 0; i < train_test_feat->get_num_vectors(); ++i) + for (int i = 0; i < arff_features[0]->get_num_vectors(); 
++i) { if (type_vector[i] == LabelType::TRAIN) { From 753f8ba16f0fbdeef4956ee656c4dd2bd49fcd14 Mon Sep 17 00:00:00 2001 From: gf712 Date: Wed, 29 May 2019 10:56:59 +0100 Subject: [PATCH 16/32] added metric evaluation --- src/shogun/io/openml/OpenMLData.cpp | 4 +- src/shogun/io/openml/OpenMLRun.cpp | 23 +++++---- src/shogun/io/openml/OpenMLRun.h | 15 +++--- src/shogun/io/openml/ShogunOpenML.cpp | 72 +++++++++++++++++++++++---- 4 files changed, 80 insertions(+), 34 deletions(-) diff --git a/src/shogun/io/openml/OpenMLData.cpp b/src/shogun/io/openml/OpenMLData.cpp index 5f88fcbfb24..4906c6db345 100644 --- a/src/shogun/io/openml/OpenMLData.cpp +++ b/src/shogun/io/openml/OpenMLData.cpp @@ -186,10 +186,8 @@ std::shared_ptr OpenMLData::get_features(const std::string& label) copy_feat, result->get_feature_matrix().data(), m_feature_types.size() * m_cached_features.size()); - result = std::make_shared>( + return std::make_shared>( copy_feat, m_feature_types.size(), n_examples); - - return result; } std::shared_ptr OpenMLData::get_labels() diff --git a/src/shogun/io/openml/OpenMLRun.cpp b/src/shogun/io/openml/OpenMLRun.cpp index da28c07994d..8cd761d6015 100644 --- a/src/shogun/io/openml/OpenMLRun.cpp +++ b/src/shogun/io/openml/OpenMLRun.cpp @@ -4,7 +4,6 @@ * Authors: Gil Hoben */ -#include #include #include #include @@ -47,7 +46,7 @@ std::shared_ptr OpenMLRun::run_flow_on_task( SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") } - auto* xval_storage = new CrossValidationStorage(); + auto xval_storage = std::make_shared(); if (task->get_split()->contains_splits()) { @@ -64,11 +63,12 @@ std::shared_ptr OpenMLRun::run_flow_on_task( for (auto fold_idx : range(task->get_num_fold())) { SGVector train_i_idx( - train_idx[repeat_idx][fold_idx].data(), - train_idx[repeat_idx][fold_idx].size()); + train_idx[repeat_idx][fold_idx].begin(), + train_idx[repeat_idx][fold_idx].end()); SGVector test_i_idx( - train_idx[repeat_idx][fold_idx].data(), - 
train_idx[repeat_idx][fold_idx].size()); + test_idx[repeat_idx][fold_idx].begin(), + test_idx[repeat_idx][fold_idx].end()); + xval_storage->append_fold_result( ShogunOpenML::run_model_on_fold( machine, task, features, labels, train_i_idx, @@ -93,9 +93,7 @@ std::shared_ptr OpenMLRun::run_flow_on_task( std::string{}, // setup_id std::string{}, // setup_string std::string{}, // parameter_settings - std::vector{}, // evaluations - std::vector{}, // fold_evaluations - std::vector{}, // sample_evaluations + xval_storage, // xval_storage std::string{}, // data_content std::vector{}, // output_files task, // task @@ -119,7 +117,10 @@ void OpenMLRun::to_filesystem(const std::string& directory) const SG_SNOTIMPLEMENTED } -void OpenMLRun::publish() const +void OpenMLRun::publish() const {SG_SNOTIMPLEMENTED} + +std::unique_ptr OpenMLRun::to_xml() const { - SG_SNOTIMPLEMENTED + + return std::unique_ptr(); } diff --git a/src/shogun/io/openml/OpenMLRun.h b/src/shogun/io/openml/OpenMLRun.h index 3619bed62d1..86e3379be31 100644 --- a/src/shogun/io/openml/OpenMLRun.h +++ b/src/shogun/io/openml/OpenMLRun.h @@ -8,6 +8,7 @@ #define SHOGUN_OPENMLRUN_H #include +#include #include #include @@ -20,9 +21,7 @@ namespace shogun { const std::string& uploader, const std::string& uploader_name, const std::string& setup_id, const std::string& setup_string, const std::string& parameter_settings, - std::vector evaluations, - std::vector fold_evaluations, - std::vector sample_evaluations, + std::shared_ptr xval_storage, const std::string& data_content, std::vector output_files, std::shared_ptr task, std::shared_ptr flow, @@ -31,9 +30,7 @@ namespace shogun { : m_uploader(uploader), m_uploader_name(uploader_name), m_setup_id(setup_id), m_setup_string(setup_string), m_parameter_settings(parameter_settings), - m_evaluations(std::move(evaluations)), - m_fold_evaluations(std::move(fold_evaluations)), - m_sample_evaluations(std::move(sample_evaluations)), + m_xval_storage(xval_storage), 
m_data_content(data_content), m_output_files(std::move(output_files)), m_task(std::move(task)), m_flow(std::move(flow)), m_run_id(run_id), @@ -55,6 +52,8 @@ namespace shogun { void to_filesystem(const std::string& directory) const; + std::unique_ptr to_xml() const; + void publish() const; private: @@ -63,9 +62,7 @@ namespace shogun { std::string m_setup_id; std::string m_setup_string; std::string m_parameter_settings; - std::vector m_evaluations; - std::vector m_fold_evaluations; - std::vector m_sample_evaluations; + std::shared_ptr m_xval_storage; std::string m_data_content; std::vector m_output_files; std::shared_ptr m_task; diff --git a/src/shogun/io/openml/ShogunOpenML.cpp b/src/shogun/io/openml/ShogunOpenML.cpp index ed73099f1c3..87987e3036f 100644 --- a/src/shogun/io/openml/ShogunOpenML.cpp +++ b/src/shogun/io/openml/ShogunOpenML.cpp @@ -6,6 +6,8 @@ * Authors: Gil Hoben */ +#include +#include #include #include @@ -310,6 +312,30 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( { auto task_type = task->get_task_type(); + CEvaluation* evaluation_criterion = nullptr; + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + evaluation_criterion = new CAccuracyMeasure(); + break; + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + evaluation_criterion = new CMeanAbsoluteError(); + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + switch (task_type) { case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: @@ -324,8 +350,6 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( // shared auto* features_clone = features->clone()->as(); auto* 
labels_clone = labels->clone()->as(); - // auto* evaluation_criterion = - // (CEvaluation*)m_evaluation_criterion->clone(); /* evtl. update xvalidation output class */ fold->set_run_index(repeat_idx); @@ -371,8 +395,10 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( SG_REF(result_labels); /* evaluate */ - // results[i] = evaluation_criterion->evaluate(result_labels, labels); - // SG_DEBUG("result on fold %d is %f\n", i, results[i]) + auto result = + evaluation_criterion->evaluate(result_labels, labels_clone); + SG_SINFO( + "result on repeat %d fold %d is %f\n", repeat_idx, fold_idx, result) /* evtl. update xvalidation output class */ fold->set_test_indices(test_idx); @@ -381,18 +407,17 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( fold->set_test_true_result(true_labels); SG_UNREF(true_labels) fold->post_update_results(); - // fold->set_evaluation_result(results[i]); + fold->set_evaluation_result(result); /* clean up, remove subsets */ labels->remove_subset(); SG_UNREF(cloned_machine); SG_UNREF(features_clone); SG_UNREF(labels_clone); - // SG_UNREF(evaluation_criterion); SG_UNREF(result_labels); + delete evaluation_criterion; return fold; } - break; case OpenMLTask::TaskType::LEARNING_CURVE: SG_SNOTIMPLEMENTED case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: @@ -417,6 +442,30 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( { auto task_type = task->get_task_type(); + CEvaluation* evaluation_criterion = nullptr; + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + evaluation_criterion = new CAccuracyMeasure(); + break; + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + evaluation_criterion = new CMeanAbsoluteError(); + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + 
SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + switch (task_type) { case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: @@ -446,8 +495,9 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( SG_SDEBUG("finished evaluation\n") /* evaluate */ - // results[i] = evaluation_criterion->evaluate(result_labels, labels); - // SG_DEBUG("result on fold %d is %f\n", i, results[i]) + auto result = + evaluation_criterion->evaluate(result_labels, labels_clone); + SG_SINFO("result is %f\n", result) /* evtl. update xvalidation output class */ fold->set_test_result(result_labels); @@ -455,14 +505,14 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( fold->set_test_true_result(true_labels); SG_UNREF(true_labels) fold->post_update_results(); - // fold->set_evaluation_result(results[i]); + fold->set_evaluation_result(result); // cleanup SG_UNREF(cloned_machine); SG_UNREF(features_clone); SG_UNREF(labels_clone); - // SG_UNREF(evaluation_criterion); SG_UNREF(result_labels); + delete evaluation_criterion; return fold; } case OpenMLTask::TaskType::LEARNING_CURVE: From f1909405bffe093be037dc0663cc82eeb6582384 Mon Sep 17 00:00:00 2001 From: gf712 Date: Fri, 3 May 2019 14:48:18 +0100 Subject: [PATCH 17/32] initial openml code --- src/shogun/io/OpenmlFlow.cpp | 100 +++++++++++++++++++++++ src/shogun/io/OpenmlFlow.h | 154 +++++++++++++++++++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 src/shogun/io/OpenmlFlow.cpp create mode 100644 src/shogun/io/OpenmlFlow.h diff --git a/src/shogun/io/OpenmlFlow.cpp b/src/shogun/io/OpenmlFlow.cpp new file mode 100644 index 00000000000..f9cdb19af00 --- /dev/null +++ b/src/shogun/io/OpenmlFlow.cpp @@ -0,0 +1,100 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). 
+ * + * Authors: Gil Hoben + */ + +#ifdef HAVE_CURL + +#include +#include "OpenmlFlow.h" + + +using namespace shogun; + +size_t writer(char *data, size_t size, size_t nmemb, std::string* buffer_in) +{ + // adapted from https://stackoverflow.com/a/5780603 + // Is there anything in the buffer? + if (buffer_in->empty()) + { + // Append the data to the buffer + buffer_in->append(data, size * nmemb); + + return size * nmemb; + } + + return 0; +} + +const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; +const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLReader::dataset_description = "/data/{}"; +const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; +const char* OpenMLReader::data_features = "/data/features/{}"; +const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; +const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; +const char* OpenMLReader::flow_file = "/flow/{}"; + +const std::unordered_map + OpenMLReader::m_format_options = {{"xml", xml_server}, + {"json", json_server}}; +const std::unordered_map + OpenMLReader::m_request_options = { + + {"dataset_description", dataset_description}, + {"list_data_qualities", list_data_qualities}, + {"data_features", data_features}, + {"list_dataset_qualities", list_dataset_qualities}, + {"list_dataset_filter", list_dataset_filter}, + {"flow_file", flow_file}}; + +OpenMLReader::OpenMLReader(const std::string& api_key) : m_api_key(api_key) +{ +} + +void OpenMLReader::post(const std::string& request, const std::string& data) +{ +} + +void OpenMLReader::openml_curl_request_helper(const std::string& url) +{ + CURL* curl_handle = nullptr; + + curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.") + return; + } + + curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl_handle, CURLOPT_HTTPGET,1); + curl_easy_setopt(curl_handle, 
CURLOPT_WRITEFUNCTION, writer); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); + + CURLcode res = curl_easy_perform(curl_handle); + + openml_curl_error_helper(res); + + curl_easy_cleanup(curl_handle); +} + +void OpenMLReader::openml_curl_error_helper(CURLcode code) { + +} + + +void OpenMLFlow::download_flow() +{ + + auto reader = OpenMLReader(m_api_key); + auto return_string = reader.get("flow_file", "json", m_flow_id); +} + +void OpenMLFlow::upload_flow(const OpenMLFlow& flow) +{ +} + +#endif // HAVE_CURL diff --git a/src/shogun/io/OpenmlFlow.h b/src/shogun/io/OpenmlFlow.h new file mode 100644 index 00000000000..6787df3e252 --- /dev/null +++ b/src/shogun/io/OpenmlFlow.h @@ -0,0 +1,154 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLFLOW_H +#define SHOGUN_OPENMLFLOW_H + +#ifdef HAVE_CURL + +#include + +#include +#include +#include +#include +#include +#include + +namespace shogun +{ + class OpenMLReader + { + + public: + explicit OpenMLReader(const std::string& api_key); + + /** + * Returns a string returned by the server given a request. + * Raises an error if the returned code is not 200. + * Additional arguments can be passed to the request, + * which are then concatenated with a "/" character. + * + * @tparam Args argument type pack, should all be std::string + * @param request the request name, see m_request_options + * @param format the format to return the data in, see m_format_options + * @param args the additional arguments to be passed to request + * @return the returned stream from the server if the return code is 200 + */ + template + std::string + get(const std::string& request, const std::string& format, Args... 
args) + { + auto find_format = m_format_options.find(format); + if (find_format == m_format_options.end()) + { + SG_SERROR( + "The provided format \"%s\" is not available\n", + format.c_str()) + } + auto find_request = m_request_options.find(request); + if (find_request == m_request_options.end()) + { + SG_SERROR( + "Could not find a way to solve the request \"%s\"\n", + request.c_str()) + } + std::string request_format = find_format->second; + std::string request_path = find_request->second; + + // get additional args and concatenate them with "/" + if (sizeof...(Args) > 0) + { + if (request_path.substr(request_path.size() - 2) == "{}") + { + request_path = + request_path.substr(0, request_path.size() - 2); + } + else + { + SG_SERROR( + "The provided request \"%s\" cannot handle additional " + "args.\n", + request.c_str()) + } + std::vector args_vec = {args...}; + std::string args_string = std::accumulate( + args_vec.begin() + 1, args_vec.end(), args_vec.front(), + [](std::string s0, std::string& s1) { + return s0 += "/" + s1; + }); + request_path += args_string; + } + + std::string url = request_format + request_path + "?" + m_api_key; + + openml_curl_request_helper(url); + + return m_curl_response_buffer; + } + + void post(const std::string& request, const std::string& data); + + private: + + std::string m_curl_response_buffer; + + /** + * Initialises CURL session and gets the data. + * This function also handles the response code from the server. 
+ * + * @param url the url to query + */ + void openml_curl_request_helper(const std::string& url); + + /** + * Handles all possible codes + * + * @param code the code returned by the query + */ + void openml_curl_error_helper(CURLcode code); + + std::string m_api_key; + + static const char* xml_server; + static const char* json_server; + + static const std::unordered_map + m_format_options; + static const std::unordered_map + m_request_options; + + /* DATA API */ + static const char* dataset_description; + static const char* list_data_qualities; + static const char* data_features; + static const char* list_dataset_qualities; + static const char* list_dataset_filter; + + /* FLOW API */ + static const char* flow_file; + }; + + class OpenMLFlow + { + + public: + explicit OpenMLFlow( + const std::string& api_key, const std::string& flow_id) + : m_api_key(api_key), m_flow_id(flow_id){}; + + void download_flow(); + + static void upload_flow(const OpenMLFlow& flow); + + private: + std::string m_api_key; + std::string m_flow_id; + }; +} // namespace shogun +#endif // HAVE_CURL + +#endif // SHOGUN_OPENMLFLOW_H From 9b331a66cf4ee979b630e843e7a2fc98b05b3842 Mon Sep 17 00:00:00 2001 From: gf712 Date: Tue, 7 May 2019 14:09:35 +0100 Subject: [PATCH 18/32] basic flow interface working with python --- cmake/ShogunUtils.cmake | 58 ++-- src/interfaces/swig/IO.i | 2 + src/interfaces/swig/IO_includes.i | 1 + src/shogun/CMakeLists.txt | 339 ++++++++++--------- src/shogun/io/OpenMLFlow.cpp | 187 ++++++++++ src/shogun/io/{OpenmlFlow.h => OpenMLFlow.h} | 92 ++++- src/shogun/io/OpenmlFlow.cpp | 100 ------ 7 files changed, 471 insertions(+), 308 deletions(-) create mode 100644 src/shogun/io/OpenMLFlow.cpp rename src/shogun/io/{OpenmlFlow.h => OpenMLFlow.h} (59%) delete mode 100644 src/shogun/io/OpenmlFlow.cpp diff --git a/cmake/ShogunUtils.cmake b/cmake/ShogunUtils.cmake index 7c94069528a..eca6cdad079 100644 --- a/cmake/ShogunUtils.cmake +++ b/cmake/ShogunUtils.cmake @@ -75,31 +75,31 @@ 
ENDMACRO() # based on compiz_discover_tests function (shogun_discover_tests EXECUTABLE) - add_dependencies (${EXECUTABLE} discover_gtest_tests) - - add_custom_command (TARGET ${EXECUTABLE} - POST_BUILD - COMMAND ${CMAKE_COMMAND} -D UNIT_TEST_CMD=${CMAKE_BINARY_DIR}/bin/${EXECUTABLE} - -D DISCOVER_CMD=${CMAKE_BINARY_DIR}/bin/discover_gtest_tests - -D WORKING_DIR=${CMAKE_CURRENT_BINARY_DIR} - -P ${CMAKE_MODULE_PATH}/discover_unit_tests.cmake - COMMENT "Discovering Tests in ${EXECUTABLE}" - DEPENDS - VERBATIM) + add_dependencies (${EXECUTABLE} discover_gtest_tests) + + add_custom_command (TARGET ${EXECUTABLE} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -D UNIT_TEST_CMD=${CMAKE_BINARY_DIR}/bin/${EXECUTABLE} + -D DISCOVER_CMD=${CMAKE_BINARY_DIR}/bin/discover_gtest_tests + -D WORKING_DIR=${CMAKE_CURRENT_BINARY_DIR} + -P ${CMAKE_MODULE_PATH}/discover_unit_tests.cmake + COMMENT "Discovering Tests in ${EXECUTABLE}" + DEPENDS + VERBATIM) endfunction () MACRO(AddMetaIntegrationTest META_TARGET CONDITION) - IF (${CONDITION}) - add_test(NAME integration_meta_${META_TARGET}-${NAME_WITH_DIR} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - COMMAND meta_example_integration_tester ${REL_DIR} ${NAME}.dat ${META_TARGET} generated_results reference_results) - set_tests_properties( - integration_meta_${META_TARGET}-${NAME_WITH_DIR} - PROPERTIES - LABELS "integration" - DEPENDS generated_${META_TARGET}-${NAME_WITH_DIR} - ) - ENDIF() + IF (${CONDITION}) + add_test(NAME integration_meta_${META_TARGET}-${NAME_WITH_DIR} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMAND meta_example_integration_tester ${REL_DIR} ${NAME}.dat ${META_TARGET} generated_results reference_results) + set_tests_properties( + integration_meta_${META_TARGET}-${NAME_WITH_DIR} + PROPERTIES + LABELS "integration" + DEPENDS generated_${META_TARGET}-${NAME_WITH_DIR} + ) + ENDIF() ENDMACRO() MACRO(AddLibShogunExample EXAMPLE_CPP) @@ -238,12 +238,12 @@ endfunction() function(SET_LINALG_BACKEND COMPONENT FLAG) 
OPTION(USE_EIGEN3_${FLAG} "Use ${COMPONENT} Eigen3" ON) CMAKE_DEPENDENT_OPTION( - USE_VIENNACL_${FLAG} "Use ${COMPONENT} ViennaCL" OFF - "VIENNACL_FOUND;USE_VIENNACL;NOT USE_EIGEN3_${FLAG}" ON) + USE_VIENNACL_${FLAG} "Use ${COMPONENT} ViennaCL" OFF + "VIENNACL_FOUND;USE_VIENNACL;NOT USE_EIGEN3_${FLAG}" ON) if(NOT ${COMPONENT}) - set(${COMPONENT} EIGEN3 CACHE STRING - "Set linear algebra backend ${COMPONENT}: EIGEN3, VIENNACL" - FORCE) + set(${COMPONENT} EIGEN3 CACHE STRING + "Set linear algebra backend ${COMPONENT}: EIGEN3, VIENNACL" + FORCE) endif() if (${COMPONENT} STREQUAL "EIGEN3") @@ -277,7 +277,7 @@ function(GET_META_EXAMPLE_VARS META_EXAMPLE EX_NAME REL_DIR NAME_WITH_DIR) endfunction() function(GET_INTERFACE_VARS INTERFACE DIRECTORY EXTENSION) - string(REGEX MATCH "INTERFACE_([a-zA-Z]+)" _dir ${INTERFACE}) + string(REGEX MATCH "INTERFACE_([a-zA-Z]+)" _dir ${INTERFACE}) STRING(TOLOWER "${CMAKE_MATCH_1}" _dir) SET(${DIRECTORY} ${_dir} PARENT_SCOPE) @@ -327,4 +327,4 @@ function(ADD_SHOGUN_BENCHMARK REL_BENCHMARK_NAME) if(ARGN) set_tests_properties(${BENCHMARK_NAME} PROPERTIES ${ARGN}) endif() -endfunction() +endfunction() \ No newline at end of file diff --git a/src/interfaces/swig/IO.i b/src/interfaces/swig/IO.i index ffc058f5e8a..474bba3ba9f 100644 --- a/src/interfaces/swig/IO.i +++ b/src/interfaces/swig/IO.i @@ -30,6 +30,7 @@ %shared_ptr(shogun::MemoryMappedFile) %shared_ptr(shogun::Compressor) +%shared_ptr(shogun::OpenMLFlow) #ifdef USE_BOOL %shared_ptr(shogun::StreamingFileFromSparseFeatures) %shared_ptr(shogun::StreamingFileFromDenseFeatures) @@ -183,6 +184,7 @@ namespace shogun %include %include +%include %include %include diff --git a/src/interfaces/swig/IO_includes.i b/src/interfaces/swig/IO_includes.i index d1d1fcbc8c8..b5b27ef02f7 100644 --- a/src/interfaces/swig/IO_includes.i +++ b/src/interfaces/swig/IO_includes.i @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/src/shogun/CMakeLists.txt 
b/src/shogun/CMakeLists.txt index 78a40e236a9..82491ab2fb2 100644 --- a/src/shogun/CMakeLists.txt +++ b/src/shogun/CMakeLists.txt @@ -1,6 +1,6 @@ INCLUDE_DIRECTORIES( - ${CMAKE_SOURCE_DIR}/src - ${CMAKE_BINARY_DIR}/src + ${CMAKE_SOURCE_DIR}/src + ${CMAKE_BINARY_DIR}/src ) include(GenerateExportHeader) @@ -46,23 +46,23 @@ set(SHOGUN_CLING_LIBRARY_DIR "\"${CMAKE_INSTALL_PREFIX}/${SHOGUN_LIB_INSTALL}\"" if (MSVC OR BUILD_BENCHMARKS) SET(LIBSHOGUN_BUILD_STATIC ON - CACHE BOOL "Build libshogun static library" FORCE) + CACHE BOOL "Build libshogun static library" FORCE) endif() # Allow to hide non-bsd compatible codes OPTION(LICENSE_GPL_SHOGUN "Include GPL codes of Shogun (non-BSD compatible) in build" ON) SET(USE_GPL_SHOGUN 0) IF (LICENSE_GPL_SHOGUN) - SET(SHOGUN_GPL_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/src/gpl) - IF(EXISTS "${SHOGUN_GPL_INCLUDE_DIR}/shogun") - SET(USE_GPL_SHOGUN 1) + SET(SHOGUN_GPL_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/src/gpl) + IF(EXISTS "${SHOGUN_GPL_INCLUDE_DIR}/shogun") + SET(USE_GPL_SHOGUN 1) - FILE(GLOB_RECURSE GPL_LIBSHOGUN_SRC ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_CPP} ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_C}) - FILE(GLOB_RECURSE GPL_LIBSHOGUN_HEADERS ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_HEADER}) + FILE(GLOB_RECURSE GPL_LIBSHOGUN_SRC ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_CPP} ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_C}) + FILE(GLOB_RECURSE GPL_LIBSHOGUN_HEADERS ${SHOGUN_GPL_INCLUDE_DIR}/*.${EXT_SRC_HEADER}) - LIST(APPEND LIBSHOGUN_SRC ${GPL_LIBSHOGUN_SRC}) - LIST(APPEND LIBSHOGUN_HEADERS ${GPL_LIBSHOGUN_HEADERS}) - INCLUDE_DIRECTORIES(${SHOGUN_GPL_INCLUDE_DIR}) + LIST(APPEND LIBSHOGUN_SRC ${GPL_LIBSHOGUN_SRC}) + LIST(APPEND LIBSHOGUN_HEADERS ${GPL_LIBSHOGUN_HEADERS}) + INCLUDE_DIRECTORIES(${SHOGUN_GPL_INCLUDE_DIR}) SET(USE_GPL_SHOGUN 1) ELSE() @@ -72,7 +72,7 @@ ENDIF() OPTION(USE_SVMLIGHT "SVMLight" ON) IF(USE_SVMLIGHT AND NOT USE_GPL_SHOGUN) - MESSAGE(FATAL_ERROR "Can only use SVMLight when GPL codes are included. 
Disable by USE_SVMLIGHT=OFF") + MESSAGE(FATAL_ERROR "Can only use SVMLight when GPL codes are included. Disable by USE_SVMLIGHT=OFF") ENDIF() # add target to compile the libshogun sources @@ -84,13 +84,13 @@ ENDIF() # Generate versionstring.h ADD_CUSTOM_TARGET( - version - ${CMAKE_COMMAND} - -D SRC=${CMAKE_CURRENT_SOURCE_DIR}/lib/versionstring.h.in - -D DST=${CMAKE_CURRENT_BINARY_DIR}/lib/versionstring.h - -D ROOT_DIR=${CMAKE_SOURCE_DIR} - -P ${CMAKE_SOURCE_DIR}/cmake/version.cmake - COMMENT "Generating version header" + version + ${CMAKE_COMMAND} + -D SRC=${CMAKE_CURRENT_SOURCE_DIR}/lib/versionstring.h.in + -D DST=${CMAKE_CURRENT_BINARY_DIR}/lib/versionstring.h + -D ROOT_DIR=${CMAKE_SOURCE_DIR} + -P ${CMAKE_SOURCE_DIR}/cmake/version.cmake + COMMENT "Generating version header" ) add_dependencies(libshogun version) @@ -109,12 +109,12 @@ FOREACH(template ${LIBSHOGUN_SRC_TMP}) ENDFOREACH() ADD_CUSTOM_COMMAND(OUTPUT ${generated_cpp} - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${template} ${generated_cpp} - COMMAND ${PYTHON_EXECUTABLE} ${generator_script} ${generated_cpp} -in ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt - DEPENDS ${template} ${generator_script} version ${LIBSHOGUN_HEADERS} ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - COMMENT "Generating ${generated_cpp}" - ) + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${template} ${generated_cpp} + COMMAND ${PYTHON_EXECUTABLE} ${generator_script} ${generated_cpp} -in ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt + DEPENDS ${template} ${generator_script} version ${LIBSHOGUN_HEADERS} ${CMAKE_CURRENT_BINARY_DIR}/headers_list.txt + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Generating ${generated_cpp}" + ) ADD_CUSTOM_TARGET(${generated_target} DEPENDS ${generated_cpp}) add_dependencies(libshogun ${generated_target}) @@ -125,18 +125,18 @@ ENDFOREACH() add_library(shogun SHARED $ ${CMAKE_CURRENT_BINARY_DIR}/lib/config.h) generate_export_header(shogun) 
set_target_properties( - shogun PROPERTIES - VERSION ${LIBSHOGUNVER} - SOVERSION ${LIBSHOGUNSO} - INTERFACE_shogun_MAJOR_VERSION ${SHOGUN_VERSION_MAJOR}) + shogun PROPERTIES + VERSION ${LIBSHOGUNVER} + SOVERSION ${LIBSHOGUNSO} + INTERFACE_shogun_MAJOR_VERSION ${SHOGUN_VERSION_MAJOR}) set_property(TARGET shogun APPEND PROPERTY COMPATIBLE_INTERFACE_STRING ${SHOGUN_VERSION_MAJOR}) target_include_directories(shogun PUBLIC - $ - $ - $ -) + $ + $ + $ + ) IF (${USE_GPL_SHOGUN}) - target_include_directories(shogun PUBLIC $) + target_include_directories(shogun PUBLIC $) ENDIF() get_property(dirs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) @@ -149,11 +149,11 @@ endforeach() add_library(shogun_deps INTERFACE) target_link_libraries(shogun_deps INTERFACE shogun) target_include_directories(shogun_deps INTERFACE - $ - $ -) + $ + $ + ) IF (${USE_GPL_SHOGUN}) - target_include_directories(shogun_deps INTERFACE $) + target_include_directories(shogun_deps INTERFACE $) ENDIF() # add target for static library if enabled @@ -161,10 +161,10 @@ if (LIBSHOGUN_BUILD_STATIC) add_library(shogun-static STATIC $ ${CMAKE_CURRENT_BINARY_DIR}/lib/config.h) set_property(TARGET shogun-static PROPERTY OUTPUT_NAME shogun) target_include_directories(shogun-static PUBLIC - $ - $ - $ - ) + $ + $ + $ + ) IF (${USE_GPL_SHOGUN}) target_include_directories(shogun-static PUBLIC $) ENDIF() @@ -181,9 +181,9 @@ IF(MSVC) IF(MSVC_VERSION VERSION_LESS "1700") include(external/MSIntTypes) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC - $ - $ - ) + $ + $ + ) ENDIF() target_link_libraries(shogun PUBLIC winmm Shlwapi) @@ -301,49 +301,49 @@ endif() include(external/StanMath) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ -) + $ + $ + ) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ -) + $ + $ + ) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ -) + $ + $ + ) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ -) + $ + $ + ) ####### LINALG # ViennaCL detection SHOGUN_DEPENDENCIES( - LIBRARY ViennaCL - SCOPE 
PRIVATE - VERSION ${VIENNACL_VERSION_MINIMUM} - CONFIG_FLAG HAVE_VIENNACL) + LIBRARY ViennaCL + SCOPE PRIVATE + VERSION ${VIENNACL_VERSION_MINIMUM} + CONFIG_FLAG HAVE_VIENNACL) ####### /LINALG FIND_PACKAGE(rxcpp) IF(NOT rxcpp_FOUND) - include(external/rxcpp) - SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ - ) + include(external/rxcpp) + SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM + $ + $ + ) ELSE() - SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM ${rxcpp_INCLUDE_DIR}) + SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM ${rxcpp_INCLUDE_DIR}) ENDIF() # TFLogger package FIND_PACKAGE(TFLogger 0.1.0 CONFIG) IF (TFLogger_FOUND) - SET(HAVE_TFLOGGER 1) - SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE SYSTEM ${TFLogger_INCLUDE_DIR}) - target_link_libraries(shogun PRIVATE tflogger::tflogger) + SET(HAVE_TFLOGGER 1) + SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE SYSTEM ${TFLogger_INCLUDE_DIR}) + target_link_libraries(shogun PRIVATE tflogger::tflogger) ENDIF() #### LAPACK @@ -354,9 +354,9 @@ CHECK_CXX_SOURCE_COMPILES("#include \n int main(int argc, char** argv) IF (NOT HAVE_STD_VARIANT) include(external/variant) SHOGUN_INCLUDE_DIRS(SCOPE PUBLIC SYSTEM - $ - $ - ) + $ + $ + ) ENDIF() # spdlog @@ -376,9 +376,9 @@ ELSE() ENDIF() SHOGUN_DEPENDENCIES( - LIBRARY GLPK - SCOPE PRIVATE - CONFIG_FLAG USE_GLPK) + LIBRARY GLPK + SCOPE PRIVATE + CONFIG_FLAG USE_GLPK) SHOGUN_DEPENDENCIES( LIBRARY LibArchive @@ -391,19 +391,24 @@ SHOGUN_DEPENDENCIES( CONFIG_FLAG USE_CPLEX) SHOGUN_DEPENDENCIES( - LIBRARY ARPACK - SCOPE PRIVATE - CONFIG_FLAG HAVE_ARPACK) + LIBRARY CPLEX + SCOPE PRIVATE + CONFIG_FLAG USE_CPLEX) SHOGUN_DEPENDENCIES( - LIBRARY Mosek - SCOPE PRIVATE - CONFIG_FLAG USE_MOSEK) + LIBRARY ARPACK + SCOPE PRIVATE + CONFIG_FLAG HAVE_ARPACK) + +SHOGUN_DEPENDENCIES( + LIBRARY Mosek + SCOPE PRIVATE + CONFIG_FLAG USE_MOSEK) SHOGUN_DEPENDENCIES( - LIBRARY Protobuf - SCOPE PUBLIC - CONFIG_FLAG HAVE_PROTOBUF) + LIBRARY Protobuf + SCOPE PUBLIC + CONFIG_FLAG HAVE_PROTOBUF) IF (PROTOBUF_FOUND AND ENABLE_PROTOBUF) file(MAKE_DIRECTORY 
${CMAKE_CURRENT_BINARY_DIR}/io/protobuf) @@ -414,12 +419,12 @@ IF (PROTOBUF_FOUND AND ENABLE_PROTOBUF) SET(name "${CMAKE_CURRENT_BINARY_DIR}/io/protobuf/${FIL_WE}") ADD_CUSTOM_COMMAND( - OUTPUT "${name}.pb.cc" "${name}.pb.h" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${FIL} - --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/io/protobuf - --proto_path ${CMAKE_CURRENT_SOURCE_DIR}/io/protobuf - COMMENT "Running C++ protocol buffer compiler on ${FIL_WE}.proto" - ) + OUTPUT "${name}.pb.cc" "${name}.pb.h" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ${FIL} + --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/io/protobuf + --proto_path ${CMAKE_CURRENT_SOURCE_DIR}/io/protobuf + COMMENT "Running C++ protocol buffer compiler on ${FIL_WE}.proto" + ) ADD_CUSTOM_TARGET("${FIL_WE}Protobuf" DEPENDS "${name}.pb.cc") add_dependencies(libshogun "${FIL_WE}Protobuf") set_source_files_properties(${name}.pb.cc PROPERTIES GENERATED ON) @@ -441,40 +446,40 @@ if (NOT WIN32) # FIXME: HDF5 linking on WIN32 is broken. # at least with the hdf5 supplied in anaconda SHOGUN_DEPENDENCIES( - LIBRARY HDF5 - SCOPE PUBLIC - CONFIG_FLAG HAVE_HDF5) + LIBRARY HDF5 + SCOPE PUBLIC + CONFIG_FLAG HAVE_HDF5) endif () SHOGUN_DEPENDENCIES( - LIBRARY CURL - SCOPE PRIVATE - CONFIG_FLAG HAVE_CURL) + LIBRARY CURL + SCOPE PRIVATE + CONFIG_FLAG HAVE_CURL) SHOGUN_DEPENDENCIES( - LIBRARY ZLIB - SCOPE PRIVATE - CONFIG_FLAG USE_GZIP) + LIBRARY ZLIB + SCOPE PRIVATE + CONFIG_FLAG USE_GZIP) SHOGUN_DEPENDENCIES( - LIBRARY BZip2 - SCOPE PRIVATE - CONFIG_FLAG USE_BZIP2) + LIBRARY BZip2 + SCOPE PRIVATE + CONFIG_FLAG USE_BZIP2) SHOGUN_DEPENDENCIES( - LIBRARY LibLZMA - SCOPE PRIVATE - CONFIG_FLAG USE_LZMA) + LIBRARY LibLZMA + SCOPE PRIVATE + CONFIG_FLAG USE_LZMA) SHOGUN_DEPENDENCIES( - LIBRARY SNAPPY - SCOPE PRIVATE - CONFIG_FLAG USE_SNAPPY) + LIBRARY SNAPPY + SCOPE PRIVATE + CONFIG_FLAG USE_SNAPPY) SHOGUN_DEPENDENCIES( - LIBRARY LZO - SCOPE PRIVATE - CONFIG_FLAG USE_LZO) + LIBRARY LZO + SCOPE PRIVATE + CONFIG_FLAG USE_LZO) #integration OPTION(OpenCV "OpenCV 
Integration" OFF) @@ -502,25 +507,25 @@ IF(BUNDLE_NLOPT) SET(POSTLINKFLAGS ${POSTLINKFLAGS} ${NLOPT_LIBRARIES}) ELSE() SHOGUN_DEPENDENCIES( - LIBRARY NLopt - SCOPE PRIVATE - CONFIG_FLAG HAVE_NLOPT) + LIBRARY NLopt + SCOPE PRIVATE + CONFIG_FLAG HAVE_NLOPT) ENDIF() SHOGUN_DEPENDENCIES( - LIBRARY LpSolve - SCOPE PRIVATE - CONFIG_FLAG USE_LPSOLVE) + LIBRARY LpSolve + SCOPE PRIVATE + CONFIG_FLAG USE_LPSOLVE) SHOGUN_DEPENDENCIES( - LIBRARY ColPack - SCOPE PRIVATE - CONFIG_FLAG HAVE_COLPACK) + LIBRARY ColPack + SCOPE PRIVATE + CONFIG_FLAG HAVE_COLPACK) SHOGUN_DEPENDENCIES( - LIBRARY ARPREC - SCOPE PRIVATE - CONFIG_FLAG HAVE_ARPREC) + LIBRARY ARPREC + SCOPE PRIVATE + CONFIG_FLAG HAVE_ARPREC) ############################ HMM OPTION(USE_HMMDEBUG "HMM debug mode" OFF) @@ -547,7 +552,7 @@ OPTION(USE_LOGCACHE "Use (1+exp(x)) log cache (is much faster but less accurate) ################## linker optimisations OPTION(INCREMENTAL_LINKING "Enable incremantal linking") SET(INCREMENTAL_LINKING_DIR ${CMAKE_BINARY_DIR}/linker_cache - CACHE PATH "Incremental linking cache directory") + CACHE PATH "Incremental linking cache directory") OPTION(ENABLE_LTO "Enable link-time optimisation") include(LDGOLD) @@ -569,7 +574,7 @@ target_link_libraries(shogun PUBLIC ${POSTLINKFLAGS}) IF (BUILD_BENCHMARKS) IF (NOT BUILD_TYPE_UC STREQUAL "RELEASE") MESSAGE(WARNING "You are building benchmark against non-Release mode!" 
- "The results might not reflect the actual runtimes...") + "The results might not reflect the actual runtimes...") ENDIF() find_package(benchmark CONFIG REQUIRED) @@ -579,16 +584,16 @@ IF (BUILD_BENCHMARKS) target_link_libraries(shogun_benchmark_main benchmark::benchmark shogun-static) elseif(MSVC) target_link_libraries(shogun_benchmark_main - benchmark::benchmark - Shlwapi.lib - shogun-static - ) + benchmark::benchmark + Shlwapi.lib + shogun-static + ) else() target_link_libraries(shogun_benchmark_main - benchmark::benchmark - pthread - shogun-static - ) + benchmark::benchmark + pthread + shogun-static + ) endif() set(SHOGUN_BENCHMARK_LINK_LIBS shogun_benchmark_main) @@ -611,15 +616,15 @@ IF (LICENSE_GPL_SHOGUN) ENDIF() INSTALL( - DIRECTORY ${INCLUDE_HEADERS_DIR_LIST} - DESTINATION ${INCLUDE_INSTALL_DIR} - COMPONENT headers - FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" - PATTERN "CMakeFiles*" EXCLUDE - PATTERN "*.dir" EXCLUDE - PATTERN "*Debug*" EXCLUDE - PATTERN "*Release*" EXCLUDE - PATTERN ".settings" EXCLUDE) + DIRECTORY ${INCLUDE_HEADERS_DIR_LIST} + DESTINATION ${INCLUDE_INSTALL_DIR} + COMPONENT headers + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" + PATTERN "CMakeFiles*" EXCLUDE + PATTERN "*.dir" EXCLUDE + PATTERN "*Debug*" EXCLUDE + PATTERN "*Release*" EXCLUDE + PATTERN ".settings" EXCLUDE) # set the desidered targets to be installed set(INSTALL_TARGETS shogun) @@ -628,41 +633,41 @@ if (LIBSHOGUN_BUILD_STATIC) endif() INSTALL( - TARGETS ${INSTALL_TARGETS} - EXPORT ShogunTargets - ARCHIVE DESTINATION ${SHOGUN_LIB_INSTALL} - RUNTIME DESTINATION ${SHOGUN_LIB_INSTALL} - LIBRARY DESTINATION ${SHOGUN_LIB_INSTALL} - INCLUDES DESTINATION ${INCLUDE_INSTALL_DIR} + TARGETS ${INSTALL_TARGETS} + EXPORT ShogunTargets + ARCHIVE DESTINATION ${SHOGUN_LIB_INSTALL} + RUNTIME DESTINATION ${SHOGUN_LIB_INSTALL} + LIBRARY DESTINATION ${SHOGUN_LIB_INSTALL} + INCLUDES DESTINATION ${INCLUDE_INSTALL_DIR} ) file(TO_CMAKE_PATH ${SHOGUN_LIB_INSTALL}/cmake/shogun CONFIG_PACKAGE_DIR) 
configure_package_config_file( - ${CMAKE_SOURCE_DIR}/cmake/ShogunConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake - INSTALL_DESTINATION ${SHOGUN_LIB_INSTALL}/cmake/shogun - PATH_VARS INCLUDE_INSTALL_DIR CONFIG_PACKAGE_DIR) + ${CMAKE_SOURCE_DIR}/cmake/ShogunConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake + INSTALL_DESTINATION ${SHOGUN_LIB_INSTALL}/cmake/shogun + PATH_VARS INCLUDE_INSTALL_DIR CONFIG_PACKAGE_DIR) write_basic_package_version_file( - "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" - VERSION ${VERSION} - COMPATIBILITY SameMajorVersion + "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" + VERSION ${VERSION} + COMPATIBILITY SameMajorVersion ) export(EXPORT ShogunTargets - FILE "${CMAKE_CURRENT_BINARY_DIR}/ShogunTargets.cmake" - NAMESPACE shogun:: -) + FILE "${CMAKE_CURRENT_BINARY_DIR}/ShogunTargets.cmake" + NAMESPACE shogun:: + ) install( - EXPORT ShogunTargets - FILE ShogunTargets.cmake - NAMESPACE shogun:: - DESTINATION ${CONFIG_PACKAGE_DIR} + EXPORT ShogunTargets + FILE ShogunTargets.cmake + NAMESPACE shogun:: + DESTINATION ${CONFIG_PACKAGE_DIR} ) install( - FILES - "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake" - "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" - DESTINATION ${CONFIG_PACKAGE_DIR} -) + FILES + "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/ShogunConfigVersion.cmake" + DESTINATION ${CONFIG_PACKAGE_DIR} +) \ No newline at end of file diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp new file mode 100644 index 00000000000..505869d6789 --- /dev/null +++ b/src/shogun/io/OpenMLFlow.cpp @@ -0,0 +1,187 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). 
+ * + * Authors: Gil Hoben + */ + +#include + +#ifdef HAVE_CURL + +using namespace shogun; +using namespace rapidjson; + +size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) +{ + // adapted from https://stackoverflow.com/a/5780603 + // Is there anything in the buffer? + if (buffer_in->empty()) + { + // Append the data to the buffer + buffer_in->append(data, size * nmemb); + + return size * nmemb; + } + + return 0; +} + +const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; +const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLReader::dataset_description = "/data/{}"; +const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; +const char* OpenMLReader::data_features = "/data/features/{}"; +const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; +const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; +const char* OpenMLReader::flow_file = "/flow/{}"; + +const std::unordered_map + OpenMLReader::m_format_options = {{"xml", xml_server}, + {"json", json_server}}; +const std::unordered_map + OpenMLReader::m_request_options = { + {"dataset_description", dataset_description}, + {"list_data_qualities", list_data_qualities}, + {"data_features", data_features}, + {"list_dataset_qualities", list_dataset_qualities}, + {"list_dataset_filter", list_dataset_filter}, + {"flow_file", flow_file}}; + +OpenMLReader::OpenMLReader(const std::string& api_key) : m_api_key(api_key) +{ +} + +void OpenMLReader::openml_curl_request_helper(const std::string& url) +{ + CURL* curl_handle = nullptr; + + curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.") + return; + } + + curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl_handle, CURLOPT_HTTPGET, 1); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, 
&m_curl_response_buffer); + + CURLcode res = curl_easy_perform(curl_handle); + + openml_curl_error_helper(curl_handle, res); + + curl_easy_cleanup(curl_handle); +} + +void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) +{ + if (code != CURLE_OK) + { + // TODO: call curl_easy_cleanup(curl_handle) ? + SG_SERROR("Curl error: %s\n", curl_easy_strerror(code)) + } +// else +// { +// long response_code; +// curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &response_code); + // if (response_code == 200) + // return; + // else + // { + // if (response_code == 181) + // SG_SERROR("Unknown flow. The flow with the given ID was not + //found in the database.") else if (response_code == 180) SG_SERROR("") + // SG_SERROR("Server code: %d\n", response_code) + // } +// } +} + +std::shared_ptr +OpenMLFlow::download_flow(const std::string& flow_id, const std::string& api_key) +{ + Document document; + parameters_type params; + components_type components; + std::string name; + std::string description; + std::string class_name; + + // get flow and parse with RapidJSON + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("flow_file", "json", flow_id); + document.Parse(return_string.c_str()); + check_flow_response(document); + + // store root for convenience. We know it exists from previous check. 
+ const Value& root = document["flow"]; + + // handle parameters + if (root.HasMember("parameter")) + { + std::unordered_map param_dict; + + if (root["parameter"].IsArray()) + { + for (const auto &v : root["parameter"].GetArray()) { + emplace_string_to_map(v, param_dict, "data_type"); + emplace_string_to_map(v, param_dict, "default_value"); + emplace_string_to_map(v, param_dict, "description"); + params.emplace(v["name"].GetString(), param_dict); + param_dict.clear(); + } + } + else + { + // parameter can also be a dict, instead of array + const auto v = root["parameter"].GetObject(); + emplace_string_to_map(v, param_dict, "data_type"); + emplace_string_to_map(v, param_dict, "default_value"); + emplace_string_to_map(v, param_dict, "description"); + params.emplace(v["name"].GetString(), param_dict); + } + } + + // handle components, i.e. kernels + if (root.HasMember("component")) + { + for (const auto& v : root["component"].GetArray()) + { + components.emplace( + v["identifier"].GetString(), + OpenMLFlow::download_flow(v["flow"]["id"].GetString(), api_key)); + } + } + + // get remaining information from flow + if (root.HasMember("name")) + name = root["name"].GetString(); + if (root.HasMember("description")) + description = root["description"].GetString(); + if (root.HasMember("class_name")) + class_name = root["class_name"].GetString(); + + auto flow = std::make_shared(name, description, class_name, components, params); + + return flow; +} + +void OpenMLFlow::check_flow_response(Document& doc) +{ + if (SG_UNLIKELY(doc.HasMember("error"))) + { + const Value& root = doc["error"]; + SG_SERROR( + "Server error %s: %s\n", root["code"].GetString(), + root["message"].GetString()) + return; + } + REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n"); +} + +void OpenMLFlow::upload_flow(const std::shared_ptr& flow) +{ +} + +#endif // HAVE_CURL diff --git a/src/shogun/io/OpenmlFlow.h b/src/shogun/io/OpenMLFlow.h similarity index 59% rename from 
src/shogun/io/OpenmlFlow.h rename to src/shogun/io/OpenMLFlow.h index 6787df3e252..dee68423269 100644 --- a/src/shogun/io/OpenmlFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -7,16 +7,22 @@ #ifndef SHOGUN_OPENMLFLOW_H #define SHOGUN_OPENMLFLOW_H +#include + #ifdef HAVE_CURL +#include #include #include +#include + +#include +#include #include #include #include #include -#include namespace shogun { @@ -90,10 +96,7 @@ namespace shogun return m_curl_response_buffer; } - void post(const std::string& request, const std::string& data); - private: - std::string m_curl_response_buffer; /** @@ -107,9 +110,10 @@ namespace shogun /** * Handles all possible codes * + * @param curl_handle curl handle used in the request * @param code the code returned by the query */ - void openml_curl_error_helper(CURLcode code); + void openml_curl_error_helper(CURL* curl_handle, CURLcode code); std::string m_api_key; @@ -132,21 +136,85 @@ namespace shogun static const char* flow_file; }; + class OpenMLWritter + { + public: + OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; + + private: + std::string m_api_key; + }; + class OpenMLFlow { public: - explicit OpenMLFlow( - const std::string& api_key, const std::string& flow_id) - : m_api_key(api_key), m_flow_id(flow_id){}; + using components_type = + std::unordered_map>; + using parameters_type = std::unordered_map< + std::string, std::unordered_map>; + + OpenMLFlow( + const std::string& name, const std::string& description, + const std::string& model, components_type components, + parameters_type parameters) + : m_name(name), m_description(description), m_class_name(model), + m_parameters(parameters), m_components(components) + { + } + + ~OpenMLFlow()= default; - void download_flow(); + static std::shared_ptr + download_flow(const std::string& flow_id, const std::string& api_key); - static void upload_flow(const OpenMLFlow& flow); + static void upload_flow(const std::shared_ptr& flow); + + std::shared_ptr get_subflow(const 
std::string& name) + { + auto find_flow = m_components.find(name); + if (find_flow != m_components.end()) + return find_flow->second; + else + SG_SERROR( + "The provided subflow could not be found in this flow!") + return nullptr; + } private: - std::string m_api_key; - std::string m_flow_id; + std::string m_name; + std::string m_description; + std::string m_class_name; + parameters_type m_parameters; + components_type m_components; + +#ifndef SWIG + static void check_flow_response(rapidjson::Document& doc); + + static SG_FORCED_INLINE void emplace_string_to_map( + const rapidjson::GenericValue>& v, + std::unordered_map& param_dict, + const std::string& name) + { + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + param_dict.emplace(name, v[name.c_str()].GetString()); + else + param_dict.emplace(name, ""); + } + + static SG_FORCED_INLINE void emplace_string_to_map( + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v, + std::unordered_map& param_dict, + const std::string& name) + { + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + param_dict.emplace(name, v[name.c_str()].GetString()); + else + param_dict.emplace(name, ""); + } + +#endif // SWIG }; } // namespace shogun #endif // HAVE_CURL diff --git a/src/shogun/io/OpenmlFlow.cpp b/src/shogun/io/OpenmlFlow.cpp deleted file mode 100644 index f9cdb19af00..00000000000 --- a/src/shogun/io/OpenmlFlow.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). - * - * Authors: Gil Hoben - */ - -#ifdef HAVE_CURL - -#include -#include "OpenmlFlow.h" - - -using namespace shogun; - -size_t writer(char *data, size_t size, size_t nmemb, std::string* buffer_in) -{ - // adapted from https://stackoverflow.com/a/5780603 - // Is there anything in the buffer? 
- if (buffer_in->empty()) - { - // Append the data to the buffer - buffer_in->append(data, size * nmemb); - - return size * nmemb; - } - - return 0; -} - -const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; -const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; -const char* OpenMLReader::dataset_description = "/data/{}"; -const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; -const char* OpenMLReader::data_features = "/data/features/{}"; -const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; -const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; -const char* OpenMLReader::flow_file = "/flow/{}"; - -const std::unordered_map - OpenMLReader::m_format_options = {{"xml", xml_server}, - {"json", json_server}}; -const std::unordered_map - OpenMLReader::m_request_options = { - - {"dataset_description", dataset_description}, - {"list_data_qualities", list_data_qualities}, - {"data_features", data_features}, - {"list_dataset_qualities", list_dataset_qualities}, - {"list_dataset_filter", list_dataset_filter}, - {"flow_file", flow_file}}; - -OpenMLReader::OpenMLReader(const std::string& api_key) : m_api_key(api_key) -{ -} - -void OpenMLReader::post(const std::string& request, const std::string& data) -{ -} - -void OpenMLReader::openml_curl_request_helper(const std::string& url) -{ - CURL* curl_handle = nullptr; - - curl_handle = curl_easy_init(); - - if (!curl_handle) - { - SG_SERROR("Failed to initialise curl handle.") - return; - } - - curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl_handle, CURLOPT_HTTPGET,1); - curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); - curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); - - CURLcode res = curl_easy_perform(curl_handle); - - openml_curl_error_helper(res); - - curl_easy_cleanup(curl_handle); -} - -void OpenMLReader::openml_curl_error_helper(CURLcode code) { - -} - - -void 
OpenMLFlow::download_flow() -{ - - auto reader = OpenMLReader(m_api_key); - auto return_string = reader.get("flow_file", "json", m_flow_id); -} - -void OpenMLFlow::upload_flow(const OpenMLFlow& flow) -{ -} - -#endif // HAVE_CURL From 21d2a009ebc88a46d0c2ce59015c5181b5fb665e Mon Sep 17 00:00:00 2001 From: gf712 Date: Tue, 7 May 2019 16:29:34 +0100 Subject: [PATCH 19/32] moved json dependency to library --- src/shogun/io/OpenMLFlow.cpp | 197 +++++++++++++++++++++++++++++------ src/shogun/io/OpenMLFlow.h | 97 ++++++++++++----- 2 files changed, 235 insertions(+), 59 deletions(-) diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index 505869d6789..a7ef3279f97 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -5,6 +5,10 @@ */ #include +#include +#include + +#include #ifdef HAVE_CURL @@ -59,7 +63,7 @@ void OpenMLReader::openml_curl_request_helper(const std::string& url) if (!curl_handle) { - SG_SERROR("Failed to initialise curl handle.") + SG_SERROR("Failed to initialise curl handle.\n") return; } @@ -82,24 +86,61 @@ void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) // TODO: call curl_easy_cleanup(curl_handle) ? SG_SERROR("Curl error: %s\n", curl_easy_strerror(code)) } -// else -// { -// long response_code; -// curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &response_code); - // if (response_code == 200) - // return; - // else - // { - // if (response_code == 181) - // SG_SERROR("Unknown flow. The flow with the given ID was not - //found in the database.") else if (response_code == 180) SG_SERROR("") - // SG_SERROR("Server code: %d\n", response_code) - // } -// } + // else + // { + // long response_code; + // curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, + //&response_code); if (response_code == 200) return; + // else + // { + // if (response_code == 181) + // SG_SERROR("Unknown flow. 
The flow with the given ID was not + // found in the database.") else if (response_code == 180) + // SG_SERROR("") SG_SERROR("Server code: %d\n", response_code) + // } + // } } -std::shared_ptr -OpenMLFlow::download_flow(const std::string& flow_id, const std::string& api_key) +#endif // HAVE_CURL + +static void check_flow_response(rapidjson::Document& doc) +{ + if (SG_UNLIKELY(doc.HasMember("error"))) + { + const Value& root = doc["error"]; + SG_SERROR( + "Server error %s: %s\n", root["code"].GetString(), + root["message"].GetString()) + return; + } + REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n"); +} + +static SG_FORCED_INLINE void emplace_string_to_map( + const rapidjson::GenericValue>& v, + std::unordered_map& param_dict, + const std::string& name) +{ + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + param_dict.emplace(name, v[name.c_str()].GetString()); + else + param_dict.emplace(name, ""); +} + +static SG_FORCED_INLINE void emplace_string_to_map( + const rapidjson::GenericObject< + true, rapidjson::GenericValue>>& v, + std::unordered_map& param_dict, + const std::string& name) +{ + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + param_dict.emplace(name, v[name.c_str()].GetString()); + else + param_dict.emplace(name, ""); +} + +std::shared_ptr OpenMLFlow::download_flow( + const std::string& flow_id, const std::string& api_key) { Document document; parameters_type params; @@ -124,7 +165,8 @@ OpenMLFlow::download_flow(const std::string& flow_id, const std::string& api_key if (root["parameter"].IsArray()) { - for (const auto &v : root["parameter"].GetArray()) { + for (const auto& v : root["parameter"].GetArray()) + { emplace_string_to_map(v, param_dict, "data_type"); emplace_string_to_map(v, param_dict, "default_value"); emplace_string_to_map(v, param_dict, "description"); @@ -146,11 +188,22 @@ OpenMLFlow::download_flow(const std::string& flow_id, const std::string& api_key // handle components, i.e. 
kernels if (root.HasMember("component")) { - for (const auto& v : root["component"].GetArray()) + if (root["component"].IsArray()) + { + for (const auto& v : root["component"].GetArray()) + { + components.emplace( + v["identifier"].GetString(), + OpenMLFlow::download_flow( + v["flow"]["id"].GetString(), api_key)); + } + } + else { components.emplace( - v["identifier"].GetString(), - OpenMLFlow::download_flow(v["flow"]["id"].GetString(), api_key)); + root["component"]["identifier"].GetString(), + OpenMLFlow::download_flow( + root["component"]["flow"]["id"].GetString(), api_key)); } } @@ -162,26 +215,106 @@ OpenMLFlow::download_flow(const std::string& flow_id, const std::string& api_key if (root.HasMember("class_name")) class_name = root["class_name"].GetString(); - auto flow = std::make_shared(name, description, class_name, components, params); + auto flow = std::make_shared( + name, description, class_name, components, params); return flow; } -void OpenMLFlow::check_flow_response(Document& doc) +void OpenMLFlow::upload_flow(const std::shared_ptr& flow) { - if (SG_UNLIKELY(doc.HasMember("error"))) +} + +void OpenMLFlow::dump() +{ +} + +std::shared_ptr OpenMLFlow::from_file() +{ + return std::shared_ptr(); +} + +std::shared_ptr ShogunOpenML::flow_to_model( + std::shared_ptr flow, bool initialize_with_defaults) +{ + std::string name; + std::string val_as_string; + std::shared_ptr obj; + auto params = flow->get_parameters(); + auto components = flow->get_components(); + auto class_name = get_class_info(flow->get_class_name()); + auto module_name = std::get<0>(class_name); + auto algo_name = std::get<1>(class_name); + if (module_name == "machine") + obj = std::shared_ptr(machine(algo_name)); + else if (module_name == "kernel") + obj = std::shared_ptr(kernel(algo_name)); + else if (module_name == "distance") + obj = std::shared_ptr(distance(algo_name)); + else + SG_SERROR("Unsupported factory \"%s\"\n", module_name.c_str()) + auto obj_param = obj->get_params(); + + auto 
put_lambda = [&obj, &name, &val_as_string](const auto& val) { + // cast value using type from get, i.e. val + auto val_ = char_to_scalar>( + val_as_string.c_str()); + obj->put(name, val_); + }; + + if (initialize_with_defaults) { - const Value& root = doc["error"]; - SG_SERROR( - "Server error %s: %s\n", root["code"].GetString(), - root["message"].GetString()) - return; + for (const auto& param : params) + { + Any any_val = obj_param.at(param.first)->get_value(); + name = param.first; + val_as_string = param.second.at("default_value"); + sg_any_dispatch(any_val, sg_all_typemap, put_lambda); + } } - REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n"); + + for (const auto& component : components) + { + CSGObject* a = + flow_to_model(component.second, initialize_with_defaults).get(); + // obj->put(component.first, a); + } + + return obj; } -void OpenMLFlow::upload_flow(const std::shared_ptr& flow) +std::shared_ptr +ShogunOpenML::model_to_flow(const std::shared_ptr& model) { + return std::shared_ptr(); } -#endif // HAVE_CURL +std::tuple +ShogunOpenML::get_class_info(const std::string& class_name) +{ + std::vector class_components; + auto begin = class_name.begin(); + std::tuple result; + + for (auto it = class_name.begin(); it != class_name.end(); ++it) + { + if (*it == '.') + { + class_components.emplace_back(std::string(begin, it)); + begin = std::next(it); + } + if (std::next(it) == class_name.end()) + class_components.emplace_back(std::string(begin, std::next(it))); + } + if (class_components.size() != 3) + SG_SERROR("Invalid class name format %s\n", class_name.c_str()) + if (class_components[0] == "shogun") + result = std::make_tuple(class_components[1], class_components[2]); + else + SG_SERROR( + "The provided flow is not meant for shogun deserialisation! 
The " + "required library is \"%s\"\n", + class_components[0].c_str()) + + return result; +} diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index dee68423269..8fc46594a08 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -15,7 +15,6 @@ #include #include -#include #include #include @@ -150,9 +149,9 @@ namespace shogun public: using components_type = - std::unordered_map>; + std::unordered_map>; using parameters_type = std::unordered_map< - std::string, std::unordered_map>; + std::string, std::unordered_map>; OpenMLFlow( const std::string& name, const std::string& description, @@ -163,13 +162,15 @@ namespace shogun { } - ~OpenMLFlow()= default; - static std::shared_ptr download_flow(const std::string& flow_id, const std::string& api_key); + static std::shared_ptr from_file(); + static void upload_flow(const std::shared_ptr& flow); + void dump(); + std::shared_ptr get_subflow(const std::string& name) { auto find_flow = m_components.find(name); @@ -181,40 +182,82 @@ namespace shogun return nullptr; } +#ifndef SWIG + SG_FORCED_INLINE parameters_type get_parameters() + { + return m_parameters; + } + + SG_FORCED_INLINE components_type get_components() + { + return m_components; + } + + SG_FORCED_INLINE std::string get_class_name() + { + return m_class_name; + } +#endif // SWIG + private: std::string m_name; std::string m_description; std::string m_class_name; parameters_type m_parameters; components_type m_components; + }; #ifndef SWIG - static void check_flow_response(rapidjson::Document& doc); + template + T char_to_scalar(const char* string_val) + { + SG_SERROR( + "No registered conversion from string to type \"s\"\n", + demangled_type().c_str()) + return 0; + } + + template <> + float32_t char_to_scalar(const char* string_val) + { + char* end; + return std::strtof(string_val, &end); + } - static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericValue>& v, - std::unordered_map& param_dict, - const 
std::string& name) - { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else - param_dict.emplace(name, ""); - } + template <> + float64_t char_to_scalar(const char* string_val) + { + char* end; + return std::strtod(string_val, &end); + } - static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericObject< - true, rapidjson::GenericValue>>& v, - std::unordered_map& param_dict, - const std::string& name) - { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else - param_dict.emplace(name, ""); - } + template <> + floatmax_t char_to_scalar(const char* string_val) + { + char* end; + return std::strtold(string_val, &end); + } + + template <> + bool char_to_scalar(const char* string_val) + { + return strcmp(string_val, "true"); + } #endif // SWIG + + class ShogunOpenML + { + public: + static std::shared_ptr flow_to_model( + std::shared_ptr flow, bool initialize_with_defaults); + + static std::shared_ptr + model_to_flow(const std::shared_ptr& model); + + private: + static std::tuple + get_class_info(const std::string& class_name); }; } // namespace shogun #endif // HAVE_CURL From 134cf2ecd5779166bebb611ebfa457ae8c497b38 Mon Sep 17 00:00:00 2001 From: gf712 Date: Wed, 8 May 2019 12:25:49 +0100 Subject: [PATCH 20/32] initial ShogunOpenML class --- src/interfaces/swig/IO.i | 2 + src/shogun/io/OpenMLFlow.cpp | 289 +++++++++++++++++++++++++++++------ src/shogun/io/OpenMLFlow.h | 135 +++++++++++----- 3 files changed, 343 insertions(+), 83 deletions(-) diff --git a/src/interfaces/swig/IO.i b/src/interfaces/swig/IO.i index 474bba3ba9f..96c22289edb 100644 --- a/src/interfaces/swig/IO.i +++ b/src/interfaces/swig/IO.i @@ -31,6 +31,8 @@ %shared_ptr(shogun::Compressor) %shared_ptr(shogun::OpenMLFlow) +%shared_ptr(shogun::ShogunOpenML::flow_to_model) +%shared_ptr(shogun::ShogunOpenML::model_to_flow) #ifdef USE_BOOL 
%shared_ptr(shogun::StreamingFileFromSparseFeatures) %shared_ptr(shogun::StreamingFileFromDenseFeatures) diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index a7ef3279f97..95a315af945 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -15,6 +15,14 @@ using namespace shogun; using namespace rapidjson; +/** + * The writer callback function used to write the packets to a C++ string. + * @param data the data received in CURL request + * @param size always 1 + * @param nmemb the size of data + * @param buffer_in the buffer to write to + * @return the size of buffer that was written + */ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) { // adapted from https://stackoverflow.com/a/5780603 @@ -30,13 +38,16 @@ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) return 0; } +/* OpenML server format */ const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +/* DATA API */ const char* OpenMLReader::dataset_description = "/data/{}"; const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; const char* OpenMLReader::data_features = "/data/features/{}"; const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; +/* FLOW API */ const char* OpenMLReader::flow_file = "/flow/{}"; const std::unordered_map @@ -84,26 +95,17 @@ void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) if (code != CURLE_OK) { // TODO: call curl_easy_cleanup(curl_handle) ? 
- SG_SERROR("Curl error: %s\n", curl_easy_strerror(code)) + SG_SERROR("Connection error: %s.\n", curl_easy_strerror(code)) } - // else - // { - // long response_code; - // curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, - //&response_code); if (response_code == 200) return; - // else - // { - // if (response_code == 181) - // SG_SERROR("Unknown flow. The flow with the given ID was not - // found in the database.") else if (response_code == 180) - // SG_SERROR("") SG_SERROR("Server code: %d\n", response_code) - // } - // } } #endif // HAVE_CURL -static void check_flow_response(rapidjson::Document& doc) +/** + * Checks the returned flow in JSON format + * @param doc the parsed flow + */ +static void check_flow_response(Document& doc) { if (SG_UNLIKELY(doc.HasMember("error"))) { @@ -116,24 +118,36 @@ static void check_flow_response(rapidjson::Document& doc) REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n"); } +/** + * Helper function to add JSON objects as string in map + * @param v a RapidJSON GenericValue, i.e. string + * @param param_dict the map to write to + * @param name the name of the key + */ static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericValue>& v, + const GenericValue>& v, std::unordered_map& param_dict, const std::string& name) { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + if (v[name.c_str()].GetType() == Type::kStringType) param_dict.emplace(name, v[name.c_str()].GetString()); else param_dict.emplace(name, ""); } +/** + * Helper function to add JSON objects as string in map + * @param v a RapidJSON GenericObject, i.e. 
array + * @param param_dict the map to write to + * @param name the name of the key + */ static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericObject< - true, rapidjson::GenericValue>>& v, + const GenericObject< + true, GenericValue>>& v, std::unordered_map& param_dict, const std::string& name) { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + if (v[name.c_str()].GetType() == Type::kStringType) param_dict.emplace(name, v[name.c_str()].GetString()); else param_dict.emplace(name, ""); @@ -234,52 +248,235 @@ std::shared_ptr OpenMLFlow::from_file() return std::shared_ptr(); } +/** + * Class using the Any visitor pattern to convert + * a string to a C++ type that can be used as a parameter + * in a Shogun model. + */ +class StringToShogun : public AnyVisitor +{ +public: + explicit StringToShogun(std::shared_ptr model) + : m_model(model), m_parameter(""), m_string_val(""){}; + + StringToShogun( + std::shared_ptr model, const std::string& parameter, + const std::string& string_val) + : m_model(model), m_parameter(parameter), m_string_val(string_val){}; + + void on(bool* v) final + { + if (!is_null()) + { + SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + bool result = strcmp(m_string_val.c_str(), "true") == 0; + m_model->put(m_parameter, result); + } + } + void on(int32_t* v) final + { + if (!is_null()) + { + SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + try + { + int32_t result = std::stoi(m_string_val); + m_model->put(m_parameter, result); + } + catch (const std::invalid_argument&) + { + // it's an option, i.e. 
internally represented + // as an enum but in swig exposed as a string + m_string_val.erase( + std::remove_if( + m_string_val.begin(), m_string_val.end(), + // remove quotes + [](const auto& val) { return val == '\"'; }), + m_string_val.end()); + m_model->put(m_parameter, m_string_val); + } + } + } + void on(int64_t* v) final + { + if (!is_null()) + { + SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + int64_t result = std::stol(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(float* v) final + { + if (!is_null()) + { + SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + char* end; + float32_t result = std::strtof(m_string_val.c_str(), &end); + m_model->put(m_parameter, result); + } + } + void on(double* v) final + { + if (!is_null()) + { + SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + char* end; + float64_t result = std::strtod(m_string_val.c_str(), &end); + m_model->put(m_parameter, result); + } + } + void on(long double* v) + { + if (!is_null()) + { + SG_SDEBUG("long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + char* end; + floatmax_t result = std::strtold(m_string_val.c_str(), &end); + m_model->put(m_parameter, result); + } + } + void on(CSGObject** v) final + { + SG_SDEBUG("CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + 
SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + + bool is_null() + { + bool result = strcmp(m_string_val.c_str(), "null") == 0; + return result; + } + + void set_parameter_name(const std::string& name) + { + m_parameter = name; + } + + void set_string_value(const std::string& value) + { + m_string_val = value; + } + +private: + std::shared_ptr m_model; + std::string m_parameter; + std::string m_string_val; +}; + +/** + * Instantiates a CSGObject using a factory + * @param factory_name the name of the factory + * @param algo_name the name of algorithm passed to factory + * @return the instantiated object using a factory + */ +std::shared_ptr instantiate_model_from_factory( + const std::string& factory_name, const std::string& algo_name) +{ + std::shared_ptr obj; + if (factory_name == "machine") + obj = std::shared_ptr(machine(algo_name)); + else if (factory_name == "kernel") + obj = std::shared_ptr(kernel(algo_name)); + else if (factory_name == "distance") + obj = std::shared_ptr(distance(algo_name)); + else + SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) + + return obj; +} + +/** + * Downcasts a CSGObject and puts it in the map of obj. + * @param obj the main object + * @param nested_obj the object to be casted and put in the obj map. 
+ * @param parameter_name the name of nested_obj + */ +void cast_and_put( + const std::shared_ptr& obj, + const std::shared_ptr& nested_obj, + const std::string& parameter_name) +{ + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + // TODO: remove clone + // temporary fix until shared_ptr PR merged + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + } + else if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + } + else if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + } + else + SG_SERROR("Could not cast SGObject.\n") +} + std::shared_ptr ShogunOpenML::flow_to_model( std::shared_ptr flow, bool initialize_with_defaults) { - std::string name; - std::string val_as_string; - std::shared_ptr obj; auto params = flow->get_parameters(); auto components = flow->get_components(); auto class_name = get_class_info(flow->get_class_name()); auto module_name = std::get<0>(class_name); auto algo_name = std::get<1>(class_name); - if (module_name == "machine") - obj = std::shared_ptr(machine(algo_name)); - else if (module_name == "kernel") - obj = std::shared_ptr(kernel(algo_name)); - else if (module_name == "distance") - obj = std::shared_ptr(distance(algo_name)); - else - SG_SERROR("Unsupported factory \"%s\"\n", module_name.c_str()) + + auto obj = instantiate_model_from_factory(module_name, algo_name); auto obj_param = obj->get_params(); - auto put_lambda = [&obj, &name, &val_as_string](const auto& val) { - // cast value using type from get, i.e. 
val - auto val_ = char_to_scalar>( - val_as_string.c_str()); - obj->put(name, val_); - }; + std::unique_ptr visitor(new StringToShogun(obj)); if (initialize_with_defaults) { for (const auto& param : params) { Any any_val = obj_param.at(param.first)->get_value(); - name = param.first; - val_as_string = param.second.at("default_value"); - sg_any_dispatch(any_val, sg_all_typemap, put_lambda); + std::string name = param.first; + std::string val_as_string = param.second.at("default_value"); + visitor->set_parameter_name(name); + visitor->set_string_value(val_as_string); + any_val.visit(visitor.get()); } } for (const auto& component : components) { - CSGObject* a = - flow_to_model(component.second, initialize_with_defaults).get(); - // obj->put(component.first, a); + std::shared_ptr nested_obj = + flow_to_model(component.second, initialize_with_defaults); + cast_and_put(obj, nested_obj, component.first); } + SG_SDEBUG("Final object: %s.\n", obj->to_string().c_str()); + return obj; } @@ -306,15 +503,15 @@ ShogunOpenML::get_class_info(const std::string& class_name) if (std::next(it) == class_name.end()) class_components.emplace_back(std::string(begin, std::next(it))); } - if (class_components.size() != 3) - SG_SERROR("Invalid class name format %s\n", class_name.c_str()) if (class_components[0] == "shogun") result = std::make_tuple(class_components[1], class_components[2]); else SG_SERROR( "The provided flow is not meant for shogun deserialisation! The " - "required library is \"%s\"\n", + "required library is \"%s\".\n", class_components[0].c_str()) + if (class_components.size() != 3) + SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) return result; } diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 8fc46594a08..8c00ffedb49 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -25,6 +25,9 @@ namespace shogun { + /** + * Reads OpenML streams which can be downloaded with this function. 
+ */ class OpenMLReader { @@ -96,6 +99,7 @@ namespace shogun } private: + /** the raw buffer as a C++ string */ std::string m_curl_response_buffer; /** @@ -114,13 +118,18 @@ namespace shogun */ void openml_curl_error_helper(CURL* curl_handle, CURLcode code); + /** the user API key, not required for all requests */ std::string m_api_key; + /** the server path to get a response in XML format*/ static const char* xml_server; + /** the server path to get a response in JSON format*/ static const char* json_server; + /** the server response format options: XML or JSON */ static const std::unordered_map m_format_options; + /** all the supported server options */ static const std::unordered_map m_request_options; @@ -135,24 +144,48 @@ namespace shogun static const char* flow_file; }; + /** + * Writes OpenML streams to the OpenML server. + */ class OpenMLWritter { public: OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; private: + /** the user API key, likely to be needed to write to OpenML */ std::string m_api_key; }; + /** + * Handles OpenML flows. A flow contains the information + * required to instantiate a model. + */ class OpenMLFlow { public: + /** alias for component type, map of flows */ using components_type = std::unordered_map>; + /** alias for parameter type, map of maps with information specific to a + * parameter */ using parameters_type = std::unordered_map< std::string, std::unordered_map>; + /** + * The OpenMLFlow constructor. This constructor is rarely used by the + * user and is used by the static class members download_flow and + * from_file. The user is expected to use either of the previously + * mentioned functions. + * + * @param name the model name + * @param description the model description + * @param model the flow class_name field + * @param components a map of subflows, i.e. kernels + * @param parameters a map of parameter information, i.e. 
default values + * for each parameter name + */ OpenMLFlow( const std::string& name, const std::string& description, const std::string& model, components_type components, @@ -162,15 +195,39 @@ namespace shogun { } + /** + * Instantiates a OpenMLFlow by downloaded a flow from the OpenML server. + * + * @param flow_id the flow ID + * @param api_key the user API key (might not be required and can be an empty string) + * @return the OpenMLFlow corresponding to the flow requested + * @throws ShogunException when there is a server error or the requested flow is ill formed. + */ static std::shared_ptr download_flow(const std::string& flow_id, const std::string& api_key); + /** + * Instantiates a OpenMLFlow from a file. + * @return the OpenMLFlow corresponding to the flow requested + */ static std::shared_ptr from_file(); + /** + * Publishes a flow to the OpenML server + * @param flow the flow to be published + */ static void upload_flow(const std::shared_ptr& flow); + /** + * Dumps the OpenMLFlow to disk. + */ void dump(); + /** + * Gets a subflow, i.e. 
a kernel in a machine + * @param name the name of the subflow, not the flow ID + * @return the subflow if it exists + */ std::shared_ptr get_subflow(const std::string& name) { auto find_flow = m_components.find(name); @@ -200,62 +257,66 @@ namespace shogun #endif // SWIG private: + /** name field of the flow */ std::string m_name; + /** description field of the flow */ std::string m_description; + /** the class_name field of the flow */ std::string m_class_name; + /** the parameter field of the flow (optional) */ parameters_type m_parameters; + /** the components fields of the flow (optional) */ components_type m_components; }; -#ifndef SWIG - template - T char_to_scalar(const char* string_val) - { - SG_SERROR( - "No registered conversion from string to type \"s\"\n", - demangled_type().c_str()) - return 0; - } - - template <> - float32_t char_to_scalar(const char* string_val) - { - char* end; - return std::strtof(string_val, &end); - } - - template <> - float64_t char_to_scalar(const char* string_val) + /** + * Handles OpenML tasks. A task contains all the information + * required to train and test a model. + */ + class OpenMLTask { - char* end; - return std::strtod(string_val, &end); - } - - template <> - floatmax_t char_to_scalar(const char* string_val) - { - char* end; - return std::strtold(string_val, &end); - } - - template <> - bool char_to_scalar(const char* string_val) - { - return strcmp(string_val, "true"); - } - -#endif // SWIG + public: + OpenMLTask(); + }; + /** + * The Shogun OpenML extension to run models from an OpenMLFlow + * and convert models to OpenMLFlow. + */ class ShogunOpenML { public: + /** + * Instantiates a SGObject from an OpenMLFlow. 
+ * + * @param flow the flow to instantiate + * @param initialize_with_defaults whether to use the default values + * specified in the flow + * @return the flow as a trainable model + */ static std::shared_ptr flow_to_model( std::shared_ptr flow, bool initialize_with_defaults); + /** + * Converts a SGObject to an OpenMLFlow. + * + * @param model the model to convert + * @return the flow from the model conversion + */ static std::shared_ptr model_to_flow(const std::shared_ptr& model); private: + /** + * Helper function to extract module/factory information from the class + * name field of OpenMLFlow. Throws an error either if the class name + * field is ill formed (i.e. not library.module.algorithm) or if the + * library name is not "shogun". + * + * @param class_name the flow class_name field + * @return a tuple with the module name (factory string) and the + * algorithm name + */ static std::tuple get_class_info(const std::string& class_name); }; From 07b07d099b380240ffd6f217bd344b8cdaf36731 Mon Sep 17 00:00:00 2001 From: gf712 Date: Wed, 8 May 2019 14:36:36 +0100 Subject: [PATCH 21/32] started working datasets --- src/interfaces/swig/IO.i | 2 - src/shogun/io/OpenMLFlow.cpp | 403 ++++++++++++++++++++++++++++++----- src/shogun/io/OpenMLFlow.h | 168 ++++++++++++++- 3 files changed, 508 insertions(+), 65 deletions(-) diff --git a/src/interfaces/swig/IO.i b/src/interfaces/swig/IO.i index 96c22289edb..474bba3ba9f 100644 --- a/src/interfaces/swig/IO.i +++ b/src/interfaces/swig/IO.i @@ -31,8 +31,6 @@ %shared_ptr(shogun::Compressor) %shared_ptr(shogun::OpenMLFlow) -%shared_ptr(shogun::ShogunOpenML::flow_to_model) -%shared_ptr(shogun::ShogunOpenML::model_to_flow) #ifdef USE_BOOL %shared_ptr(shogun::StreamingFileFromSparseFeatures) %shared_ptr(shogun::StreamingFileFromDenseFeatures) diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index 95a315af945..b8572e05a31 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -10,11 
+10,11 @@ #include -#ifdef HAVE_CURL - using namespace shogun; using namespace rapidjson; +#ifdef HAVE_CURL + /** * The writer callback function used to write the packets to a C++ string. * @param data the data received in CURL request @@ -45,10 +45,13 @@ const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; const char* OpenMLReader::dataset_description = "/data/{}"; const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; const char* OpenMLReader::data_features = "/data/features/{}"; +const char* OpenMLReader::data_qualities = "/data/qualities/{}"; const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; /* FLOW API */ const char* OpenMLReader::flow_file = "/flow/{}"; +/* TASK API */ +const char* OpenMLReader::task_file = "/task/{}"; const std::unordered_map OpenMLReader::m_format_options = {{"xml", xml_server}, @@ -58,9 +61,11 @@ const std::unordered_map {"dataset_description", dataset_description}, {"list_data_qualities", list_data_qualities}, {"data_features", data_features}, + {"data_qualities", data_qualities}, {"list_dataset_qualities", list_dataset_qualities}, {"list_dataset_filter", list_dataset_filter}, - {"flow_file", flow_file}}; + {"flow_file", flow_file}, + {"task_file", task_file}}; OpenMLReader::OpenMLReader(const std::string& api_key) : m_api_key(api_key) { @@ -102,10 +107,10 @@ void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) #endif // HAVE_CURL /** - * Checks the returned flow in JSON format - * @param doc the parsed flow + * Checks the returned response from OpenML in JSON format + * @param doc the parsed OpenML JSON format response */ -static void check_flow_response(Document& doc) +static void check_response(const Document& doc, const std::string& type) { if (SG_UNLIKELY(doc.HasMember("error"))) { @@ -115,7 +120,9 @@ static void check_flow_response(Document& doc) root["message"].GetString()) 
return; } - REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n"); + REQUIRE( + doc.HasMember(type.c_str()), "Unexpected format of OpenML %s.\n", + type.c_str()); } /** @@ -142,8 +149,7 @@ static SG_FORCED_INLINE void emplace_string_to_map( * @param name the name of the key */ static SG_FORCED_INLINE void emplace_string_to_map( - const GenericObject< - true, GenericValue>>& v, + const GenericObject>>& v, std::unordered_map& param_dict, const std::string& name) { @@ -153,6 +159,55 @@ static SG_FORCED_INLINE void emplace_string_to_map( param_dict.emplace(name, ""); } +template +SG_FORCED_INLINE T return_if_possible( + const std::string& name, + const GenericObject>>& v) +{ + SG_SNOTIMPLEMENTED +} + +template <> +SG_FORCED_INLINE std::string return_if_possible( + const std::string& name, + const GenericObject>>& v) +{ + if (v.HasMember(name.c_str()) && v[name.c_str()].IsString()) + return v[name.c_str()].GetString(); + if (v.HasMember(name.c_str()) && !v[name.c_str()].IsString()) + SG_SERROR("Found member \"%s\" but it is not a string", name.c_str()) + if (!v.HasMember(name.c_str())) + return ""; + SG_SERROR("\"%s\" is not a member of the given object", name.c_str()) + return nullptr; +} + +template <> +SG_FORCED_INLINE std::vector +return_if_possible>( + const std::string& name, + const GenericObject>>& v) +{ + std::vector result; + if (!v.HasMember(name.c_str())) + SG_SERROR("\"%s\" is not a member of the given object", name.c_str()) + if (v[name.c_str()].IsString()) + { + result.emplace_back(v[name.c_str()].GetString()); + } + if (v[name.c_str()].IsArray()) + { + for (const auto& val : v[name.c_str()].GetArray()) + { + if (val.IsString()) + result.emplace_back(val.GetString()); + else + SG_SERROR("Found non string member in \"%s\".\n", name.c_str()) + } + } + return result; +} + std::shared_ptr OpenMLFlow::download_flow( const std::string& flow_id, const std::string& api_key) { @@ -167,7 +222,7 @@ std::shared_ptr OpenMLFlow::download_flow( auto 
reader = OpenMLReader(api_key); auto return_string = reader.get("flow_file", "json", flow_id); document.Parse(return_string.c_str()); - check_flow_response(document); + check_response(document, "flow"); // store root for convenience. We know it exists from previous check. const Value& root = document["flow"]; @@ -237,21 +292,241 @@ std::shared_ptr OpenMLFlow::download_flow( void OpenMLFlow::upload_flow(const std::shared_ptr& flow) { + SG_SNOTIMPLEMENTED; } void OpenMLFlow::dump() { + SG_SNOTIMPLEMENTED; } std::shared_ptr OpenMLFlow::from_file() { + SG_SNOTIMPLEMENTED; return std::shared_ptr(); } +std::shared_ptr +OpenMLData::get_data(const std::string& id, const std::string& api_key) +{ + // description + Document document; + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("dataset_description", "json", id); + + document.Parse(return_string.c_str()); + check_response(document, "data_set_description"); + + const Value& dataset_description = document["data_set_description"]; + + auto name = return_if_possible( + "name", dataset_description.GetObject()); + auto description = return_if_possible( + "description", dataset_description.GetObject()); + auto data_format = return_if_possible( + "data_format", dataset_description.GetObject()); + auto dataset_id = + return_if_possible("id", dataset_description.GetObject()); + auto version = return_if_possible( + "version", dataset_description.GetObject()); + auto creator = return_if_possible( + "creator", dataset_description.GetObject()); + auto contributor = return_if_possible( + "contributor", dataset_description.GetObject()); + auto collection_date = return_if_possible( + "collection_date", dataset_description.GetObject()); + auto upload_date = return_if_possible( + "upload_date", dataset_description.GetObject()); + auto language = return_if_possible( + "language", dataset_description.GetObject()); + auto licence = return_if_possible( + "licence", dataset_description.GetObject()); + auto url = + 
return_if_possible("url", dataset_description.GetObject()); + auto default_target_attribute = return_if_possible( + "default_target_attribute", dataset_description.GetObject()); + auto row_id_attribute = return_if_possible( + "row_id_attribute", dataset_description.GetObject()); + auto ignore_attribute = return_if_possible( + "ignore_attribute", dataset_description.GetObject()); + auto version_label = return_if_possible( + "version_label", dataset_description.GetObject()); + auto citation = return_if_possible( + "citation", dataset_description.GetObject()); + auto tags = return_if_possible>( + "tag", dataset_description.GetObject()); + auto visibility = return_if_possible( + "visibility", dataset_description.GetObject()); + auto original_data_url = return_if_possible( + "original_data_url", dataset_description.GetObject()); + auto paper_url = return_if_possible( + "paper_url", dataset_description.GetObject()); + auto update_comment = return_if_possible( + "update_comment", dataset_description.GetObject()); + auto md5_checksum = return_if_possible( + "md5_checksum", dataset_description.GetObject()); + + // features + std::vector> param_vector; + return_string = reader.get("data_features", "json", id); + document.Parse(return_string.c_str()); + check_response(document, "data_features"); + const Value& dataset_features = document["data_features"]; + for (const auto& param : dataset_features.GetArray()) + { + std::unordered_map param_map; + for (const auto& param_descriptors : param.GetObject()) + { + param_map.emplace( + param_descriptors.name.GetString(), + param_descriptors.value.GetString()); + } + param_vector.push_back(param_map); + } + + // qualities + std::vector> qualities_vector; + return_string = reader.get("data_qualities", "json", id); + document.Parse(return_string.c_str()); + check_response(document, "data_qualities"); + const Value& data_qualities = document["data_qualities"]; + for (const auto& param : data_qualities.GetArray()) + { + 
std::unordered_map param_map; + for (const auto& param_quality : param.GetObject()) + { + param_map.emplace( + param_quality.name.GetString(), + param_quality.value.GetString()); + } + qualities_vector.push_back(param_map); + } + + auto result = std::make_shared( + name, description, data_format, dataset_id, version, creator, + contributor, collection_date, upload_date, language, licence, url, + default_target_attribute, row_id_attribute, ignore_attribute, + version_label, citation, tags, visibility, original_data_url, paper_url, + update_comment, md5_checksum, param_vector, qualities_vector); + + return result; +} + +std::string OpenMLData::get_data_buffer(const std::string& api_key) +{ + SG_SNOTIMPLEMENTED; + return nullptr; +} + +std::shared_ptr +OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) +{ + Document document; + std::string task_name; + std::string task_type_id; + std::shared_ptr openml_dataset; + std::shared_ptr openml_split; + std::pair, std::shared_ptr> + task_descriptor; + + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("task_file", "json", task_id); + + document.Parse(return_string.c_str()); + check_response(document, "task"); + + const Value& root = document["task"]; + + REQUIRE( + task_id == root["task_id"].GetString(), + "Expected downloaded task to have the same id as the requested task " + "id, but got \"%s\", instead of \"%s\".\n", + root["task_id"].GetString(), task_id.c_str()) + + task_name = root["task_name"].GetString(); + OpenMLTask::TaskType task_type = + get_task_from_string(root["task_type"].GetString()); + task_type_id = root["task_type_id"].GetString(); + + // expect two elements in input array: dataset and split + const Value& json_input = root["input"]; + + REQUIRE( + json_input.IsArray(), "Currently the dataset reader can only handle " + "inputs with a dataset and split field.\n") + + auto input_array = json_input.GetArray(); + REQUIRE( + input_array.Size() == 2, + "Currently 
the dataset reader can only handle inputs with a dataset " + "and split fields. Found %d elements.\n", + input_array.Size()) + + // handle dataset + auto json_dataset = input_array[0].GetObject(); + + if (strcmp(json_dataset["name"].GetString(), "source_data") == 0) + { + auto dataset_info = json_dataset["data_set"].GetObject(); + std::string dataset_id = dataset_info["data_set_id"].GetString(); + std::string target_feature = dataset_info["target_feature"].GetString(); + // openml_dataset = + // std::make_shared(dataset_id, target_feature); + } + else + SG_SERROR("Error parsing the OpenML dataset, could not find the " + "source_data field.\n") + + // handle split + auto json_split = input_array[1].GetObject(); + if (strcmp(json_split["name"].GetString(), "estimation_procedure") == 0) + { + auto split_info = json_dataset["estimation_procedure"].GetObject(); + std::string split_id = split_info["id"].GetString(); + std::string split_type = split_info["type"].GetString(); + std::string split_url = split_info["data_splits_url"].GetString(); + std::unordered_map split_parameters; + for (const auto& param : split_info["parameter"].GetArray()) + { + if (param.Size() == 2) + split_parameters.emplace( + param["name"].GetString(), param["value"].GetString()); + else if (param.Size() == 1) + split_parameters.emplace(param["name"].GetString(), ""); + else + SG_SERROR("Unexpected number of parameters in parameter array " + "of estimation_procedure.\n") + } + openml_split = std::make_shared( + split_id, split_type, split_url, split_parameters); + } + else + SG_SERROR("Error parsing the OpenML dataset, could not find the " + "estimation_procedure field.\n") + + task_descriptor = std::make_pair(openml_dataset, openml_split); + + auto result = std::make_shared( + task_id, task_name, task_type, task_type_id, task_descriptor); + + return result; +} + +OpenMLTask::TaskType +OpenMLTask::get_task_from_string(const std::string& task_type) +{ + if (task_type == "Supervised Classification") 
+ return OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION; + SG_SERROR("OpenMLTask does not supported \"%s\"", task_type.c_str()) +} + /** * Class using the Any visitor pattern to convert * a string to a C++ type that can be used as a parameter - * in a Shogun model. + * in a Shogun model. If the string value is not "null" it will + * be put in its casted type in the given model with the provided parameter + * name. If the value is null nothing happens, i.e. no error is thrown + * and no value is put in model. */ class StringToShogun : public AnyVisitor { @@ -266,18 +541,18 @@ class StringToShogun : public AnyVisitor void on(bool* v) final { + SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) bool result = strcmp(m_string_val.c_str(), "true") == 0; m_model->put(m_parameter, result); } } void on(int32_t* v) final { + SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) try { int32_t result = std::stoi(m_string_val); @@ -299,84 +574,94 @@ class StringToShogun : public AnyVisitor } void on(int64_t* v) final { + SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + int64_t result = std::stol(m_string_val); m_model->put(m_parameter, result); } } void on(float* v) final { + SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - char* end; - float32_t result = std::strtof(m_string_val.c_str(), &end); + float32_t result = std::stof(m_string_val); m_model->put(m_parameter, result); } } void on(double* v) final { + SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("double: %s=%s\n", 
m_parameter.c_str(), m_string_val.c_str()) - char* end; - float64_t result = std::strtod(m_string_val.c_str(), &end); + float64_t result = std::stod(m_string_val); m_model->put(m_parameter, result); } } void on(long double* v) { + SG_SDEBUG( + "long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { - SG_SDEBUG("long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - char* end; - floatmax_t result = std::strtold(m_string_val.c_str(), &end); + floatmax_t result = std::stold(m_string_val); m_model->put(m_parameter, result); } } void on(CSGObject** v) final { - SG_SDEBUG("CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGVector* v) final { - SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGVector* v) final { - SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } void on(SGVector* v) final { - SG_SDEBUG("SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } void on(SGMatrix* mat) final { - SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGMatrix* mat) final { - SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - } - void on(SGMatrix* mat) final - { - SG_SDEBUG("SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } - - bool is_null() + void on(SGMatrix* mat) final{SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())} + + /** + * In OpenML "null" is an empty parameter 
value field. + * @return whether the field is "null" + */ + SG_FORCED_INLINE bool is_null() { bool result = strcmp(m_string_val.c_str(), "null") == 0; return result; } - void set_parameter_name(const std::string& name) + SG_FORCED_INLINE void set_parameter_name(const std::string& name) { m_parameter = name; } - void set_string_value(const std::string& value) + SG_FORCED_INLINE void set_string_value(const std::string& value) { m_string_val = value; } @@ -396,17 +681,16 @@ class StringToShogun : public AnyVisitor std::shared_ptr instantiate_model_from_factory( const std::string& factory_name, const std::string& algo_name) { - std::shared_ptr obj; if (factory_name == "machine") - obj = std::shared_ptr(machine(algo_name)); - else if (factory_name == "kernel") - obj = std::shared_ptr(kernel(algo_name)); - else if (factory_name == "distance") - obj = std::shared_ptr(distance(algo_name)); - else - SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) + return std::shared_ptr(machine(algo_name)); + if (factory_name == "kernel") + return std::shared_ptr(kernel(algo_name)); + if (factory_name == "distance") + return std::shared_ptr(distance(algo_name)); - return obj; + SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) + + return nullptr; } /** @@ -426,19 +710,21 @@ void cast_and_put( // temporary fix until shared_ptr PR merged auto* tmp_clone = dynamic_cast(casted_obj->clone()); obj->put(parameter_name, tmp_clone); + return; } - else if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) { auto* tmp_clone = dynamic_cast(casted_obj->clone()); obj->put(parameter_name, tmp_clone); + return; } - else if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) { auto* tmp_clone = dynamic_cast(casted_obj->clone()); obj->put(parameter_name, tmp_clone); + return; } - else - SG_SERROR("Could not cast SGObject.\n") + 
SG_SERROR("Could not cast SGObject.\n") } std::shared_ptr ShogunOpenML::flow_to_model( @@ -447,8 +733,8 @@ std::shared_ptr ShogunOpenML::flow_to_model( auto params = flow->get_parameters(); auto components = flow->get_components(); auto class_name = get_class_info(flow->get_class_name()); - auto module_name = std::get<0>(class_name); - auto algo_name = std::get<1>(class_name); + auto module_name = class_name.first; + auto algo_name = class_name.second; auto obj = instantiate_model_from_factory(module_name, algo_name); auto obj_param = obj->get_params(); @@ -486,12 +772,12 @@ ShogunOpenML::model_to_flow(const std::shared_ptr& model) return std::shared_ptr(); } -std::tuple +std::pair ShogunOpenML::get_class_info(const std::string& class_name) { std::vector class_components; auto begin = class_name.begin(); - std::tuple result; + std::pair result; for (auto it = class_name.begin(); it != class_name.end(); ++it) { @@ -503,15 +789,16 @@ ShogunOpenML::get_class_info(const std::string& class_name) if (std::next(it) == class_name.end()) class_components.emplace_back(std::string(begin, std::next(it))); } - if (class_components[0] == "shogun") - result = std::make_tuple(class_components[1], class_components[2]); + + if (class_components[0] == "shogun" && class_components.size() == 3) + result = std::make_pair(class_components[1], class_components[2]); + else if (class_components[0] == "shogun" && class_components.size() != 3) + SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) else SG_SERROR( "The provided flow is not meant for shogun deserialisation! 
The " "required library is \"%s\".\n", class_components[0].c_str()) - if (class_components.size() != 3) - SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) return result; } diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 8c00ffedb49..8c3821332ab 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -21,6 +21,7 @@ #include #include #include +#include #include namespace shogun @@ -50,6 +51,7 @@ namespace shogun std::string get(const std::string& request, const std::string& format, Args... args) { + m_curl_response_buffer.clear(); auto find_format = m_format_options.find(format); if (find_format == m_format_options.end()) { @@ -137,11 +139,15 @@ namespace shogun static const char* dataset_description; static const char* list_data_qualities; static const char* data_features; + static const char* data_qualities; static const char* list_dataset_qualities; static const char* list_dataset_filter; /* FLOW API */ static const char* flow_file; + + /* TASK API */ + static const char* task_file; }; /** @@ -196,12 +202,15 @@ namespace shogun } /** - * Instantiates a OpenMLFlow by downloaded a flow from the OpenML server. + * Instantiates a OpenMLFlow by downloaded a flow from the OpenML + * server. * * @param flow_id the flow ID - * @param api_key the user API key (might not be required and can be an empty string) + * @param api_key the user API key (might not be required and can be an + * empty string) * @return the OpenMLFlow corresponding to the flow requested - * @throws ShogunException when there is a server error or the requested flow is ill formed. + * @throws ShogunException when there is a server error or the requested + * flow is ill formed. */ static std::shared_ptr download_flow(const std::string& flow_id, const std::string& api_key); @@ -269,6 +278,116 @@ namespace shogun components_type m_components; }; + /** + * Handles an OpenML dataset. 
+ */ + class OpenMLData + { + public: + OpenMLData( + const std::string& name, const std::string& description, + const std::string& data_format, const std::string& dataset_id, + const std::string& version, const std::string& creator, + const std::string& contributor, const std::string& collection_date, + const std::string& upload_date, const std::string& language, + const std::string& license, const std::string& url, + const std::string& default_target_attribute, + const std::string& row_id_attribute, + const std::string& ignore_attribute, + const std::string& version_label, const std::string& citation, + std::vector tag, const std::string& visibility, + const std::string& original_data_url, const std::string& paper_url, + const std::string& update_comment, const std::string& md5_checksum, + std::vector> + param_descriptors, + std::vector> + param_qualities) + + : m_name(name), m_description(description), + m_data_format(data_format), m_dataset_id(dataset_id), + m_version(version), m_creator(creator), + m_contributor(contributor), m_collection_date(collection_date), + m_upload_date(upload_date), m_language(language), + m_license(license), m_url(url), + m_default_target_attribute(default_target_attribute), + m_row_id_attribute(row_id_attribute), + m_ignore_attribute(ignore_attribute), + m_version_label(version_label), m_citation(citation), + m_tag(std::move(tag)), m_visibility(visibility), + m_original_data_url(original_data_url), m_paper_url(paper_url), + m_update_comment(update_comment), m_md5_checksum(md5_checksum), + m_param_descriptors(std::move(param_descriptors)), + m_param_qualities(std::move(param_qualities)) + { + } + + /** + * Creates a dataset instance from a given ID. 
+ * + */ + static std::shared_ptr + get_data(const std::string& id, const std::string& api_key); + + /** + * Returns the dataset + * @param api_key + * @return + */ + std::string get_data_buffer(const std::string& api_key); + + private: + std::string m_name; + std::string m_description; + std::string m_data_format; + std::string m_dataset_id; + std::string m_version; + std::string m_creator; + std::string m_contributor; + std::string m_collection_date; + std::string m_upload_date; + std::string m_language; + std::string m_license; + std::string m_url; + std::string m_default_target_attribute; + std::string m_row_id_attribute; + std::string m_ignore_attribute; + std::string m_version_label; + std::string m_citation; + std::vector m_tag; + std::string m_visibility; + std::string m_original_data_url; + std::string m_paper_url; + std::string m_update_comment; + std::string m_md5_checksum; + std::vector> + m_param_descriptors; + std::vector> + m_param_qualities; + }; + + /** + * Handles an OpenML split. + */ + class OpenMLSplit + { + public: + OpenMLSplit( + const std::string& split_id, const std::string& split_type, + const std::string& split_url, + const std::unordered_map& + split_parameters) + : m_split_id(split_id), m_split_type(split_type), + m_split_url(split_url), m_parameters(split_parameters) + { + } + + private: + std::string m_split_id; + std::string m_split_type; + std::string m_split_url; + std::unordered_map m_parameters; + }; + /** * Handles OpenML tasks. A task contains all the information * required to train and test a model. 
@@ -276,7 +395,46 @@ namespace shogun class OpenMLTask { public: - OpenMLTask(); + enum TaskType + { + SUPERVISED_CLASSIFICATION = 0, + SUPERVISED_REGRESSION = 1, + LEARNING_CURVE = 2, + SUPERVISED_DATASTREAM_CLASSIFICATION = 3, + CLUSTERING = 4, + MACHINE_LEARNING_CHALLENGE = 5, + SURVIVAL_ANALYSIS = 6, + SUBGROUP_DISCOVERY = 7 + }; + OpenMLTask( + const std::string& task_id, const std::string task_name, + TaskType task_type, const std::string& task_type_id, + const std::pair< + std::shared_ptr, std::shared_ptr>& + task_descriptor) + : m_task_id(task_id), m_task_name(task_name), + m_task_type(task_type), m_task_type_id(task_type_id), + m_task_descriptor(task_descriptor) + { + } + + static std::shared_ptr + get_task(const std::string& task_id, const std::string& api_key); + + std::shared_ptr get_dataset() + { + return m_task_descriptor.first; + } + + private: + static TaskType get_task_from_string(const std::string& task_type); + + std::string m_task_id; + std::string m_task_name; + TaskType m_task_type; + std::string m_task_type_id; + std::pair, std::shared_ptr> + m_task_descriptor; }; /** @@ -317,7 +475,7 @@ namespace shogun * @return a tuple with the module name (factory string) and the * algorithm name */ - static std::tuple + static std::pair get_class_info(const std::string& class_name); }; } // namespace shogun From 3cd470b67d08eb4e054f8c0099087ac595ffddfe Mon Sep 17 00:00:00 2001 From: gf712 Date: Tue, 14 May 2019 13:54:48 +0100 Subject: [PATCH 22/32] started work on splits --- src/shogun/io/OpenMLFlow.cpp | 241 +++++++++++++++++++++++++---------- src/shogun/io/OpenMLFlow.h | 156 +++++++++++++++++++---- 2 files changed, 309 insertions(+), 88 deletions(-) diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index b8572e05a31..bb12fc7199e 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -41,6 +41,8 @@ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) /* OpenML server format */ const 
char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLReader::splits_server = "https://www.openml.org/api_splits"; + /* DATA API */ const char* OpenMLReader::dataset_description = "/data/{}"; const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; @@ -52,10 +54,13 @@ const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; const char* OpenMLReader::flow_file = "/flow/{}"; /* TASK API */ const char* OpenMLReader::task_file = "/task/{}"; +/* SPLIT API */ +const char* OpenMLReader::get_split = "/split/{}"; const std::unordered_map OpenMLReader::m_format_options = {{"xml", xml_server}, - {"json", json_server}}; + {"json", json_server}, + {"split", splits_server}}; const std::unordered_map OpenMLReader::m_request_options = { {"dataset_description", dataset_description}, @@ -104,8 +109,6 @@ void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) } } -#endif // HAVE_CURL - /** * Checks the returned response from OpenML in JSON format * @param doc the parsed OpenML JSON format response @@ -367,19 +370,25 @@ OpenMLData::get_data(const std::string& id, const std::string& api_key) "md5_checksum", dataset_description.GetObject()); // features - std::vector> param_vector; + std::vector>> + param_vector; return_string = reader.get("data_features", "json", id); document.Parse(return_string.c_str()); check_response(document, "data_features"); const Value& dataset_features = document["data_features"]; - for (const auto& param : dataset_features.GetArray()) + for (const auto& param : dataset_features["feature"].GetArray()) { - std::unordered_map param_map; + std::unordered_map> param_map; for (const auto& param_descriptors : param.GetObject()) { - param_map.emplace( - param_descriptors.name.GetString(), - param_descriptors.value.GetString()); + std::vector second; + if (param_descriptors.value.IsArray()) + for (const auto& v : 
param_descriptors.value.GetArray()) + second.emplace_back(v.GetString()); + else + second.emplace_back(param_descriptors.value.GetString()); + + param_map.emplace(param_descriptors.name.GetString(), second); } param_vector.push_back(param_map); } @@ -390,14 +399,17 @@ OpenMLData::get_data(const std::string& id, const std::string& api_key) document.Parse(return_string.c_str()); check_response(document, "data_qualities"); const Value& data_qualities = document["data_qualities"]; - for (const auto& param : data_qualities.GetArray()) + for (const auto& param : data_qualities["quality"].GetArray()) { std::unordered_map param_map; for (const auto& param_quality : param.GetObject()) { - param_map.emplace( - param_quality.name.GetString(), - param_quality.value.GetString()); + if (param_quality.name.IsString() && param_quality.value.IsString()) + param_map.emplace( + param_quality.name.GetString(), + param_quality.value.GetString()); + else if (param_quality.name.IsString()) + param_map.emplace(param_quality.name.GetString(), ""); } qualities_vector.push_back(param_map); } @@ -418,16 +430,28 @@ std::string OpenMLData::get_data_buffer(const std::string& api_key) return nullptr; } +std::shared_ptr +OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) +{ + Document document; + + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("get_split", "split", split_url); + auto return_stream = std::istringstream(return_string); + // add ARFF parsing here + SG_SNOTIMPLEMENTED + return nullptr; +} + std::shared_ptr OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) { Document document; std::string task_name; std::string task_type_id; - std::shared_ptr openml_dataset; - std::shared_ptr openml_split; - std::pair, std::shared_ptr> - task_descriptor; + std::shared_ptr openml_dataset = nullptr; + std::shared_ptr openml_split = nullptr; + std::unordered_map evaluation_measures; auto reader = OpenMLReader(api_key); auto 
return_string = reader.get("task_file", "json", task_id); @@ -451,63 +475,62 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) // expect two elements in input array: dataset and split const Value& json_input = root["input"]; - REQUIRE( - json_input.IsArray(), "Currently the dataset reader can only handle " - "inputs with a dataset and split field.\n") - auto input_array = json_input.GetArray(); - REQUIRE( - input_array.Size() == 2, - "Currently the dataset reader can only handle inputs with a dataset " - "and split fields. Found %d elements.\n", - input_array.Size()) - - // handle dataset - auto json_dataset = input_array[0].GetObject(); - if (strcmp(json_dataset["name"].GetString(), "source_data") == 0) + for (const auto& task_settings : input_array) { - auto dataset_info = json_dataset["data_set"].GetObject(); - std::string dataset_id = dataset_info["data_set_id"].GetString(); - std::string target_feature = dataset_info["target_feature"].GetString(); - // openml_dataset = - // std::make_shared(dataset_id, target_feature); - } - else - SG_SERROR("Error parsing the OpenML dataset, could not find the " - "source_data field.\n") - - // handle split - auto json_split = input_array[1].GetObject(); - if (strcmp(json_split["name"].GetString(), "estimation_procedure") == 0) - { - auto split_info = json_dataset["estimation_procedure"].GetObject(); - std::string split_id = split_info["id"].GetString(); - std::string split_type = split_info["type"].GetString(); - std::string split_url = split_info["data_splits_url"].GetString(); - std::unordered_map split_parameters; - for (const auto& param : split_info["parameter"].GetArray()) + if (strcmp(task_settings["name"].GetString(), "source_data") == 0) { - if (param.Size() == 2) - split_parameters.emplace( - param["name"].GetString(), param["value"].GetString()); - else if (param.Size() == 1) - split_parameters.emplace(param["name"].GetString(), ""); - else - SG_SERROR("Unexpected number of parameters 
in parameter array " - "of estimation_procedure.\n") + auto dataset_info = task_settings["data_set"].GetObject(); + std::string dataset_id = dataset_info["data_set_id"].GetString(); + std::string target_feature = + dataset_info["target_feature"].GetString(); + openml_dataset = OpenMLData::get_data(dataset_id, api_key); + } + else if ( + strcmp(task_settings["name"].GetString(), "estimation_procedure") == + 0) + { + auto split_info = task_settings["estimation_procedure"].GetObject(); + std::string split_id = split_info["id"].GetString(); + std::string split_type = split_info["type"].GetString(); + std::string split_url = split_info["data_splits_url"].GetString(); + std::unordered_map split_parameters; + for (const auto& param : split_info["parameter"].GetArray()) + { + if (param.HasMember("name") && param.HasMember("value")) + split_parameters.emplace( + param["name"].GetString(), param["value"].GetString()); + else if (param.HasMember("name")) + split_parameters.emplace(param["name"].GetString(), ""); + else + SG_SERROR( + "Unexpected number of parameters in parameter array " + "of estimation_procedure.\n") + } + openml_split = std::make_shared( + split_id, split_type, split_url, split_parameters); + } + else if ( + strcmp(task_settings["name"].GetString(), "evaluation_measures") == + 0) + { + auto evaluation_info = + task_settings["evaluation_measures"].GetObject(); + for (const auto& param : evaluation_info) + { + evaluation_measures.emplace( + param.name.GetString(), param.value.GetString()); + } } - openml_split = std::make_shared( - split_id, split_type, split_url, split_parameters); } - else - SG_SERROR("Error parsing the OpenML dataset, could not find the " - "estimation_procedure field.\n") - task_descriptor = std::make_pair(openml_dataset, openml_split); + if (openml_dataset == nullptr && openml_split == nullptr) + SG_SERROR("Error parsing task.") auto result = std::make_shared( - task_id, task_name, task_type, task_type_id, task_descriptor); + task_id, 
task_name, task_type, task_type_id, evaluation_measures, + openml_split, openml_dataset); return result; } @@ -517,7 +540,19 @@ OpenMLTask::get_task_from_string(const std::string& task_type) { if (task_type == "Supervised Classification") return OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION; - SG_SERROR("OpenMLTask does not supported \"%s\"", task_type.c_str()) + SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) +} + +SGMatrix OpenMLTask::get_train_indices() +{ + SG_SNOTIMPLEMENTED + return SGMatrix(); +} + +SGMatrix OpenMLTask::get_test_indices() +{ + SG_SNOTIMPLEMENTED + return SGMatrix(); } /** @@ -802,3 +837,77 @@ ShogunOpenML::get_class_info(const std::string& class_name) return result; } + +CLabels* ShogunOpenML::run_model_on_fold( + const std::shared_ptr& model, + const std::shared_ptr& task, CFeatures* X_train, + index_t repeat_number, index_t fold_number, CLabels* y_train, + CFeatures* X_test) +{ + auto task_type = task->get_task_type(); + auto model_clone = std::shared_ptr(model->clone()); + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + { + if (auto machine = std::dynamic_pointer_cast(model_clone)) + { + machine->put("labels", y_train); + machine->train(X_train); + return machine->apply(X_test); + } + else + SG_SERROR("The provided model is not trainable!\n") + } + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + return nullptr; +} + +std::shared_ptr OpenMLRun::run_model_on_task( + std::shared_ptr model, std::shared_ptr task) +{ + SG_SNOTIMPLEMENTED + return 
std::shared_ptr(); +} + +std::shared_ptr OpenMLRun::run_flow_on_task( + std::shared_ptr flow, std::shared_ptr task) +{ + auto data = task->get_dataset(); + SG_SNOTIMPLEMENTED + return std::shared_ptr(); +} + +std::shared_ptr +OpenMLRun::from_filesystem(const std::string& directory) +{ + SG_SNOTIMPLEMENTED + return nullptr; +} + +void OpenMLRun::to_filesystem(const std::string& directory) const +{ + SG_SNOTIMPLEMENTED +} + +void OpenMLRun::publish() const +{ + SG_SNOTIMPLEMENTED +} + +#endif // HAVE_CURL diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 8c3821332ab..2232d6562fd 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -16,7 +16,6 @@ #include -#include #include #include #include @@ -51,6 +50,7 @@ namespace shogun std::string get(const std::string& request, const std::string& format, Args... args) { + std::string request_path; m_curl_response_buffer.clear(); auto find_format = m_format_options.find(format); if (find_format == m_format_options.end()) @@ -59,15 +59,27 @@ namespace shogun "The provided format \"%s\" is not available\n", format.c_str()) } - auto find_request = m_request_options.find(request); - if (find_request == m_request_options.end()) + + if (format == "split") { - SG_SERROR( - "Could not find a way to solve the request \"%s\"\n", - request.c_str()) + REQUIRE( + request == "get_split", + "Split server can only handle \"get_split\" request.\n") + request_path = get_split; } + else + { + auto find_request = m_request_options.find(request); + if (find_request == m_request_options.end()) + { + SG_SERROR( + "Could not find a way to solve the request \"%s\"\n", + request.c_str()) + } + request_path = find_request->second; + } + std::string request_format = find_format->second; - std::string request_path = find_request->second; // get additional args and concatenate them with "/" if (sizeof...(Args) > 0) @@ -127,6 +139,8 @@ namespace shogun static const char* xml_server; /** the server path to get a 
response in JSON format*/ static const char* json_server; + /** the server path to get a split in ARFF format */ + static const char* splits_server; /** the server response format options: XML or JSON */ static const std::unordered_map @@ -148,6 +162,9 @@ namespace shogun /* TASK API */ static const char* task_file; + + /* SPLIT API */ + static const char* get_split; }; /** @@ -298,7 +315,8 @@ namespace shogun std::vector tag, const std::string& visibility, const std::string& original_data_url, const std::string& paper_url, const std::string& update_comment, const std::string& md5_checksum, - std::vector> + std::vector< + std::unordered_map>> param_descriptors, std::vector> param_qualities) @@ -359,7 +377,7 @@ namespace shogun std::string m_paper_url; std::string m_update_comment; std::string m_md5_checksum; - std::vector> + std::vector>> m_param_descriptors; std::vector> m_param_qualities; @@ -381,6 +399,9 @@ namespace shogun { } + static std::shared_ptr + get_split(const std::string& split_url, const std::string& api_key); + private: std::string m_split_id; std::string m_split_type; @@ -395,7 +416,7 @@ namespace shogun class OpenMLTask { public: - enum TaskType + enum class TaskType { SUPERVISED_CLASSIFICATION = 0, SUPERVISED_REGRESSION = 1, @@ -406,26 +427,49 @@ namespace shogun SURVIVAL_ANALYSIS = 6, SUBGROUP_DISCOVERY = 7 }; + + enum class TaskEvaluation + { + + }; + OpenMLTask( const std::string& task_id, const std::string task_name, TaskType task_type, const std::string& task_type_id, - const std::pair< - std::shared_ptr, std::shared_ptr>& - task_descriptor) + std::unordered_map evaluation_measures, + std::shared_ptr split, + std::shared_ptr data) : m_task_id(task_id), m_task_name(task_name), m_task_type(task_type), m_task_type_id(task_type_id), - m_task_descriptor(task_descriptor) + m_evaluation_measures(evaluation_measures), m_split(split), + m_data(data) { } static std::shared_ptr get_task(const std::string& task_id, const std::string& api_key); - 
std::shared_ptr get_dataset() + std::shared_ptr get_dataset() const noexcept + { + return m_data; + } + + std::shared_ptr get_split() const noexcept { - return m_task_descriptor.first; + return m_split; } + SGMatrix get_train_indices(); + + SGMatrix get_test_indices(); + +#ifndef SWIG + SG_FORCED_INLINE TaskType get_task_type() const noexcept + { + return m_task_type; + } +#endif // SWIG + private: static TaskType get_task_from_string(const std::string& task_type); @@ -433,8 +477,9 @@ namespace shogun std::string m_task_name; TaskType m_task_type; std::string m_task_type_id; - std::pair, std::shared_ptr> - m_task_descriptor; + std::unordered_map m_evaluation_measures; + std::shared_ptr m_split; + std::shared_ptr m_data; }; /** @@ -464,12 +509,19 @@ namespace shogun static std::shared_ptr model_to_flow(const std::shared_ptr& model); + protected: + CLabels* run_model_on_fold( + const std::shared_ptr& model, + const std::shared_ptr& task, CFeatures* X_train, + index_t repeat_number, index_t fold_number, CLabels* y_train, + CFeatures* X_test); + private: /** - * Helper function to extract module/factory information from the class - * name field of OpenMLFlow. Throws an error either if the class name - * field is ill formed (i.e. not library.module.algorithm) or if the - * library name is not "shogun". + * Helper function to extract module/factory information from the + * class name field of OpenMLFlow. Throws an error either if the + * class name field is ill formed (i.e. not + * library.module.algorithm) or if the library name is not "shogun". 
* * @param class_name the flow class_name field * @return a tuple with the module name (factory string) and the @@ -478,6 +530,66 @@ namespace shogun static std::pair get_class_info(const std::string& class_name); }; + + class OpenMLRun + { + public: + OpenMLRun( + const std::string& uploader, const std::string& uploader_name, + const std::string& setup_id, const std::string& setup_string, + const std::string& parameter_settings, + std::vector evaluations, + std::vector fold_evaluations, + std::vector sample_evaluations, + const std::string& data_content, + std::vector output_files, + std::shared_ptr task, std::shared_ptr flow, + const std::string& run_id, std::shared_ptr model, + std::vector tags, std::string predictions_url) + : m_uploader(uploader), m_uploader_name(uploader_name), + m_setup_id(setup_id), m_setup_string(setup_string), + m_parameter_settings(parameter_settings), + m_evaluations(std::move(evaluations)), + m_fold_evaluations(std::move(fold_evaluations)), + m_sample_evaluations(std::move(sample_evaluations)), + m_data_content(data_content), + m_output_files(std::move(output_files)), m_task(task), + m_flow(flow), m_run_id(run_id), m_model(model), m_tags(tags), + m_predictions_url(predictions_url) + { + } + + static std::shared_ptr + from_filesystem(const std::string& directory); + + static std::shared_ptr run_flow_on_task( + std::shared_ptr flow, std::shared_ptr task); + + static std::shared_ptr run_model_on_task( + std::shared_ptr model, std::shared_ptr task); + + void to_filesystem(const std::string& directory) const; + + void publish() const; + + private: + std::string m_uploader; + std::string m_uploader_name; + std::string m_setup_id; + std::string m_setup_string; + std::string m_parameter_settings; + std::vector m_evaluations; + std::vector m_fold_evaluations; + std::vector m_sample_evaluations; + std::string m_data_content; + std::vector m_output_files; + std::shared_ptr m_task; + std::shared_ptr m_flow; + std::string m_run_id; + 
std::shared_ptr m_model; + std::vector m_tags; + std::string m_predictions_url; + }; } // namespace shogun #endif // HAVE_CURL From f1546a6c6217462dab25a694e2b470288b07b688 Mon Sep 17 00:00:00 2001 From: gf712 Date: Tue, 14 May 2019 17:19:03 +0100 Subject: [PATCH 23/32] removed curl from header file --- src/shogun/io/OpenMLFlow.cpp | 50 ++++++++++++------------------- src/shogun/io/OpenMLFlow.h | 57 ++++++++++++++++-------------------- 2 files changed, 44 insertions(+), 63 deletions(-) diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index bb12fc7199e..590711823df 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -5,16 +5,16 @@ */ #include -#include #include #include +#ifdef HAVE_CURL +#include +#endif // HAVE_CURL using namespace shogun; using namespace rapidjson; -#ifdef HAVE_CURL - /** * The writer callback function used to write the packets to a C++ string. * @param data the data received in CURL request @@ -25,16 +25,14 @@ using namespace rapidjson; */ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) { - // adapted from https://stackoverflow.com/a/5780603 - // Is there anything in the buffer? 
- if (buffer_in->empty()) + // check that the buffer string points to something + if (buffer_in != nullptr) { // Append the data to the buffer buffer_in->append(data, size * nmemb); return size * nmemb; } - return 0; } @@ -55,7 +53,7 @@ const char* OpenMLReader::flow_file = "/flow/{}"; /* TASK API */ const char* OpenMLReader::task_file = "/task/{}"; /* SPLIT API */ -const char* OpenMLReader::get_split = "/split/{}"; +const char* OpenMLReader::get_split = "/get/{}"; const std::unordered_map OpenMLReader::m_format_options = {{"xml", xml_server}, @@ -72,12 +70,9 @@ const std::unordered_map {"flow_file", flow_file}, {"task_file", task_file}}; -OpenMLReader::OpenMLReader(const std::string& api_key) : m_api_key(api_key) -{ -} - void OpenMLReader::openml_curl_request_helper(const std::string& url) { +#ifdef HAVE_CURL CURL* curl_handle = nullptr; curl_handle = curl_easy_init(); @@ -95,18 +90,11 @@ void OpenMLReader::openml_curl_request_helper(const std::string& url) CURLcode res = curl_easy_perform(curl_handle); - openml_curl_error_helper(curl_handle, res); + if (res != CURLE_OK) + SG_SERROR("Connection error: %s.\n", curl_easy_strerror(res)) curl_easy_cleanup(curl_handle); -} - -void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code) -{ - if (code != CURLE_OK) - { - // TODO: call curl_easy_cleanup(curl_handle) ? 
- SG_SERROR("Connection error: %s.\n", curl_easy_strerror(code)) - } +#endif // HAVE_CURL } /** @@ -298,7 +286,7 @@ void OpenMLFlow::upload_flow(const std::shared_ptr& flow) SG_SNOTIMPLEMENTED; } -void OpenMLFlow::dump() +void OpenMLFlow::dump() const { SG_SNOTIMPLEMENTED; } @@ -543,13 +531,13 @@ OpenMLTask::get_task_from_string(const std::string& task_type) SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) } -SGMatrix OpenMLTask::get_train_indices() +SGMatrix OpenMLTask::get_train_indices() const { SG_SNOTIMPLEMENTED return SGMatrix(); } -SGMatrix OpenMLTask::get_test_indices() +SGMatrix OpenMLTask::get_test_indices() const { SG_SNOTIMPLEMENTED return SGMatrix(); @@ -685,18 +673,18 @@ class StringToShogun : public AnyVisitor * In OpenML "null" is an empty parameter value field. * @return whether the field is "null" */ - SG_FORCED_INLINE bool is_null() + SG_FORCED_INLINE bool is_null() const noexcept { bool result = strcmp(m_string_val.c_str(), "null") == 0; return result; } - SG_FORCED_INLINE void set_parameter_name(const std::string& name) + SG_FORCED_INLINE void set_parameter_name(const std::string& name) noexcept { m_parameter = name; } - SG_FORCED_INLINE void set_string_value(const std::string& value) + SG_FORCED_INLINE void set_string_value(const std::string& value) noexcept { m_string_val = value; } @@ -774,7 +762,7 @@ std::shared_ptr ShogunOpenML::flow_to_model( auto obj = instantiate_model_from_factory(module_name, algo_name); auto obj_param = obj->get_params(); - std::unique_ptr visitor(new StringToShogun(obj)); + auto visitor = std::make_unique(obj); if (initialize_with_defaults) { @@ -859,7 +847,7 @@ CLabels* ShogunOpenML::run_model_on_fold( return machine->apply(X_test); } else - SG_SERROR("The provided model is not trainable!\n") + SG_SERROR("The provided model is not a trainable machine!\n") } break; case OpenMLTask::TaskType::LEARNING_CURVE: @@ -909,5 +897,3 @@ void OpenMLRun::publish() const { SG_SNOTIMPLEMENTED } - -#endif // 
HAVE_CURL diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 2232d6562fd..45086d44e94 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -9,13 +9,9 @@ #include -#ifdef HAVE_CURL - #include #include -#include - #include #include #include @@ -32,7 +28,9 @@ namespace shogun { public: - explicit OpenMLReader(const std::string& api_key); + explicit OpenMLReader(const std::string& api_key) : m_api_key(api_key) + { + } /** * Returns a string returned by the server given a request. @@ -50,7 +48,9 @@ namespace shogun std::string get(const std::string& request, const std::string& format, Args... args) { +#ifdef HAVE_CURL std::string request_path; + // clear the buffer before request m_curl_response_buffer.clear(); auto find_format = m_format_options.find(format); if (find_format == m_format_options.end()) @@ -63,8 +63,8 @@ namespace shogun if (format == "split") { REQUIRE( - request == "get_split", - "Split server can only handle \"get_split\" request.\n") + request == "get_split", + "Split server can only handle \"get_split\" request.\n") request_path = get_split; } else @@ -73,8 +73,8 @@ namespace shogun if (find_request == m_request_options.end()) { SG_SERROR( - "Could not find a way to solve the request \"%s\"\n", - request.c_str()) + "Could not find a way to solve the request \"%s\"\n", + request.c_str()) } request_path = find_request->second; } @@ -110,6 +110,9 @@ namespace shogun openml_curl_request_helper(url); return m_curl_response_buffer; +#else + SG_SERROR("This function is only available witht the CURL library!\n") +#endif // HAVE_CURL } private: @@ -124,14 +127,6 @@ namespace shogun */ void openml_curl_request_helper(const std::string& url); - /** - * Handles all possible codes - * - * @param curl_handle curl handle used in the request - * @param code the code returned by the query - */ - void openml_curl_error_helper(CURL* curl_handle, CURLcode code); - /** the user API key, not required for all requests */ 
std::string m_api_key; @@ -214,7 +209,8 @@ namespace shogun const std::string& model, components_type components, parameters_type parameters) : m_name(name), m_description(description), m_class_name(model), - m_parameters(parameters), m_components(components) + m_parameters(std::move(parameters)), + m_components(std::move(components)) { } @@ -247,7 +243,7 @@ namespace shogun /** * Dumps the OpenMLFlow to disk. */ - void dump(); + void dump() const; /** * Gets a subflow, i.e. a kernel in a machine @@ -266,17 +262,17 @@ namespace shogun } #ifndef SWIG - SG_FORCED_INLINE parameters_type get_parameters() + SG_FORCED_INLINE parameters_type get_parameters() const noexcept { return m_parameters; } - SG_FORCED_INLINE components_type get_components() + SG_FORCED_INLINE components_type get_components() const noexcept { return m_components; } - SG_FORCED_INLINE std::string get_class_name() + SG_FORCED_INLINE std::string get_class_name() const noexcept { return m_class_name; } @@ -320,7 +316,6 @@ namespace shogun param_descriptors, std::vector> param_qualities) - : m_name(name), m_description(description), m_data_format(data_format), m_dataset_id(dataset_id), m_version(version), m_creator(creator), @@ -441,8 +436,8 @@ namespace shogun std::shared_ptr data) : m_task_id(task_id), m_task_name(task_name), m_task_type(task_type), m_task_type_id(task_type_id), - m_evaluation_measures(evaluation_measures), m_split(split), - m_data(data) + m_evaluation_measures(std::move(evaluation_measures)), + m_split(std::move(split)), m_data(std::move(data)) { } @@ -459,9 +454,9 @@ namespace shogun return m_split; } - SGMatrix get_train_indices(); + SGMatrix get_train_indices() const; - SGMatrix get_test_indices(); + SGMatrix get_test_indices() const; #ifndef SWIG SG_FORCED_INLINE TaskType get_task_type() const noexcept @@ -553,9 +548,10 @@ namespace shogun m_fold_evaluations(std::move(fold_evaluations)), m_sample_evaluations(std::move(sample_evaluations)), m_data_content(data_content), - 
m_output_files(std::move(output_files)), m_task(task), - m_flow(flow), m_run_id(run_id), m_model(model), m_tags(tags), - m_predictions_url(predictions_url) + m_output_files(std::move(output_files)), m_task(std::move(task)), + m_flow(std::move(flow)), m_run_id(run_id), + m_model(std::move(model)), m_tags(std::move(tags)), + m_predictions_url(std::move(predictions_url)) { } @@ -591,6 +587,5 @@ namespace shogun std::string m_predictions_url; }; } // namespace shogun -#endif // HAVE_CURL #endif // SHOGUN_OPENMLFLOW_H From d5ac051ba6543934cdb452ef8cf3ff054d7c6fb6 Mon Sep 17 00:00:00 2001 From: gf712 Date: Wed, 15 May 2019 16:21:42 +0100 Subject: [PATCH 24/32] added data getters --- src/shogun/io/OpenMLFlow.cpp | 205 ++++++++++++++++++++++++++++++++--- src/shogun/io/OpenMLFlow.h | 119 ++++++++++++++++---- 2 files changed, 292 insertions(+), 32 deletions(-) diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index 590711823df..d2dcc4a56cd 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -6,10 +6,13 @@ #include #include +#include #include #ifdef HAVE_CURL +#include "OpenMLFlow.h" #include + #endif // HAVE_CURL using namespace shogun; @@ -39,6 +42,7 @@ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) /* OpenML server format */ const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLReader::download_server = ""; const char* OpenMLReader::splits_server = "https://www.openml.org/api_splits"; /* DATA API */ @@ -58,7 +62,8 @@ const char* OpenMLReader::get_split = "/get/{}"; const std::unordered_map OpenMLReader::m_format_options = {{"xml", xml_server}, {"json", json_server}, - {"split", splits_server}}; + {"split", splits_server}, + {"download", download_server}}; const std::unordered_map OpenMLReader::m_request_options = { {"dataset_description", dataset_description}, @@ -298,7 +303,7 
@@ std::shared_ptr OpenMLFlow::from_file() } std::shared_ptr -OpenMLData::get_data(const std::string& id, const std::string& api_key) +OpenMLData::get_dataset(const std::string& id, const std::string& api_key) { // description Document document; @@ -408,27 +413,189 @@ OpenMLData::get_data(const std::string& id, const std::string& api_key) default_target_attribute, row_id_attribute, ignore_attribute, version_label, citation, tags, visibility, original_data_url, paper_url, update_comment, md5_checksum, param_vector, qualities_vector); + result->set_api_key(api_key); + return result; +} +std::shared_ptr OpenMLData::get_features() noexcept +{ + if (!m_cached_features) + get_data(); + return m_cached_features; +} + +std::shared_ptr OpenMLData::get_features(const std::string& label) +{ + auto find_label = + std::find(m_feature_names.begin(), m_feature_names.end(), label); + if (find_label == m_feature_names.end()) + SG_SERROR( + "Requested label \"%s\" not in the dataset!\n", label.c_str()) + if (!m_cached_features) + get_data(); + auto col_idx = std::distance(m_feature_names.begin(), find_label); + auto result = std::shared_ptr(m_cached_features->clone()->as()); + if (result->delete_feature_obj(col_idx)) + SG_SERROR("Error deleting the label column in CombinedFeatures!\n") return result; } -std::string OpenMLData::get_data_buffer(const std::string& api_key) +std::shared_ptr OpenMLData::get_labels() { - SG_SNOTIMPLEMENTED; + REQUIRE( + !m_default_target_attribute.empty(), + "A default target attribute is required if no label is given!\n") + return get_labels(m_default_target_attribute); +} + +std::shared_ptr OpenMLData::get_labels(const std::string& label_name) +{ + auto find_label = + std::find(m_feature_names.begin(), m_feature_names.end(), label_name); + if (find_label == m_feature_names.end()) + SG_SERROR( + "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) + auto col_idx = std::distance(m_feature_names.begin(), find_label); + + if 
(!m_cached_features) + get_data(); + + auto target_label_as_feat = + std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); + + // TODO: replace with actual enum values + switch(m_feature_types[col_idx]) + { + // real features + case 0: + { + auto casted_feat = std::dynamic_pointer_cast>(target_label_as_feat); + auto labels_vec = casted_feat->get_feature_vector(0); + auto labels = std::make_shared(); + labels->set_values(labels_vec); + return labels; + } break; + // nominal features + case 1: + { + auto casted_feat = std::dynamic_pointer_cast>(target_label_as_feat); + auto labels_vec = casted_feat->get_feature_vector(0); + auto labels = std::make_shared(); + labels->set_values(labels_vec); + return labels; + } break; + default: + SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) + } + return nullptr; } +void OpenMLData::get_data() +{ + auto reader = OpenMLReader(m_api_key); + auto return_string = reader.get(m_url); + + // TODO: add ARFF parsing and don't forget feature names and feature types + m_cached_features = std::make_shared(); +} + std::shared_ptr OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) { - Document document; - auto reader = OpenMLReader(api_key); auto return_string = reader.get("get_split", "split", split_url); + + if (return_string == "Task not providing datasplits.") + return std::make_shared(); + auto return_stream = std::istringstream(return_string); - // add ARFF parsing here - SG_SNOTIMPLEMENTED - return nullptr; + // TODO: add ARFF parsing here + // get train/test indices + // TODO: replace line below with ARFFDeserialiser::get_features() + auto arff_features = std::make_shared(); + REQUIRE( + arff_features->get_num_feature_obj() == 4, + "Expected a ARFF file with 4 attributes: type, rowid, repeat and " + "fold.\n") + + auto train_test_feat = + std::shared_ptr(arff_features->get_feature_obj(0)); + auto rowid_feat = + std::shared_ptr(arff_features->get_feature_obj(1)); + auto repeat_feat 
= + std::shared_ptr(arff_features->get_feature_obj(2)); + auto fold_feat = + std::shared_ptr(arff_features->get_feature_obj(3)); + + auto type_vector = string_feature_to_vector(train_test_feat); + auto rowid_vector = dense_feature_to_vector(rowid_feat); + auto repeat_vector = dense_feature_to_vector(repeat_feat); + auto fold_vector = dense_feature_to_vector(fold_feat); + + std::vector> train_idx, test_idx; + for (int i = 0; i < arff_features->get_num_vectors(); ++i) + { + if (type_vector[i] == LabelType::TRAIN) + train_idx.emplace_back(std::initializer_list{ + static_cast(rowid_vector[i]), + static_cast(repeat_vector[i]), + static_cast(fold_vector[i])}); + else + test_idx.emplace_back(std::initializer_list{ + static_cast(rowid_vector[i]), + static_cast(repeat_vector[i]), + static_cast(fold_vector[i])}); + } + + return std::make_shared(train_idx, test_idx); +} + +SGVector +OpenMLSplit::dense_feature_to_vector(const std::shared_ptr& feat) +{ + auto casted_feat = + std::dynamic_pointer_cast>(feat); + // this should never happen + if (!casted_feat) + SG_SERROR("Error casting a column in the split file from CFeatures to " + "CDenseFeatures!\n>"); + return casted_feat->get_feature_vector(0); +} + +std::vector +OpenMLSplit::string_feature_to_vector(const std::shared_ptr& feat) +{ + auto casted_feat = std::dynamic_pointer_cast>(feat); + // this should never happen + if (!casted_feat) + SG_SERROR("Error casting a column in the split file from CFeatures to " + "CStringFeatures!\n"); + + auto to_lower = [](const std::string& line) { + std::string result; + std::transform( + line.begin(), line.end(), std::back_inserter(result), + [](uint8_t val) { return std::tolower(val); }); + return result; + }; + + std::vector result; + + for (int i = 0; i < casted_feat->get_num_vectors(); ++i) + { + auto row = casted_feat->get_feature_vector(i); + std::string label(1, row[0]); + for (auto j = 1; j < casted_feat->get_max_vector_length(); ++j) + label.append(1, row[j]); + if 
(to_lower(label) == "train") + result.push_back(LabelType::TRAIN); + else if (to_lower(label) == "test") + result.push_back(LabelType::TEST); + else + SG_SERROR("Unknown label type in split file %s!\n", label.c_str()) + } + return result; } std::shared_ptr @@ -473,7 +640,7 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) std::string dataset_id = dataset_info["data_set_id"].GetString(); std::string target_feature = dataset_info["target_feature"].GetString(); - openml_dataset = OpenMLData::get_data(dataset_id, api_key); + openml_dataset = OpenMLData::get_dataset(dataset_id, api_key); } else if ( strcmp(task_settings["name"].GetString(), "estimation_procedure") == @@ -496,8 +663,11 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) "Unexpected number of parameters in parameter array " "of estimation_procedure.\n") } - openml_split = std::make_shared( - split_id, split_type, split_url, split_parameters); + REQUIRE( + split_type == "crossvalidation", + "Currently only tasks with cross validation are enabled in " + "shogun!\n") + openml_split = OpenMLSplit::get_split(split_url, api_key); } else if ( strcmp(task_settings["name"].GetString(), "evaluation_measures") == @@ -877,7 +1047,16 @@ std::shared_ptr OpenMLRun::run_flow_on_task( std::shared_ptr flow, std::shared_ptr task) { auto data = task->get_dataset(); - SG_SNOTIMPLEMENTED + std::shared_ptr train_features, test_features; + std::shared_ptr train_labels, test_labels; + + if (task->get_split()->contains_splits()) + SG_SNOTIMPLEMENTED + else + { + auto labels = data->get_labels(); + auto feat = data->get_features(); + } return std::shared_ptr(); } diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 45086d44e94..1ae2691be9a 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -48,10 +49,7 @@ namespace shogun std::string get(const std::string& request, 
const std::string& format, Args... args) { -#ifdef HAVE_CURL std::string request_path; - // clear the buffer before request - m_curl_response_buffer.clear(); auto find_format = m_format_options.find(format); if (find_format == m_format_options.end()) { @@ -107,11 +105,20 @@ namespace shogun std::string url = request_format + request_path + "?" + m_api_key; - openml_curl_request_helper(url); + return get(url); + } + std::string get(const std::string& url) + { +#ifdef HAVE_CURL + // clear the buffer before request + m_curl_response_buffer.clear(); + + openml_curl_request_helper(url); return m_curl_response_buffer; #else - SG_SERROR("This function is only available witht the CURL library!\n") + SG_SERROR( + "This function is only available with the CURL library!\n") #endif // HAVE_CURL } @@ -134,6 +141,8 @@ namespace shogun static const char* xml_server; /** the server path to get a response in JSON format*/ static const char* json_server; + /** the server path to download datasets */ + static const char* download_server; /** the server path to get a split in ARFF format */ static const char* splits_server; @@ -339,16 +348,53 @@ namespace shogun * */ static std::shared_ptr - get_data(const std::string& id, const std::string& api_key); + get_dataset(const std::string& id, const std::string& api_key); /** - * Returns the dataset - * @param api_key + * Returns ALL the features of the dataset, potentially also the labels column + * @return the features + */ + std::shared_ptr get_features() noexcept; + + /** + * Returns the dataset features + * @param label_name the name of the attribute containing the label + * @return the features + */ + std::shared_ptr get_features(const std::string& label_name); + + /** + * Returns the dataset labels if m_default_target_attribute is not empty + * @return the labels + */ + std::shared_ptr get_labels(); + + /** + * Returns the dataset labels given the label_name + * @return the labels + */ + std::shared_ptr get_labels(const std::string& 
label_name); + + /** + * Returns the type of all attributes/features in the ARFF file * @return */ - std::string get_data_buffer(const std::string& api_key); + // TODO: replace with actual enum values + SG_FORCED_INLINE std::vector get_feature_types() const noexcept + { + return m_feature_types; + } + + protected: + SG_FORCED_INLINE void set_api_key(const std::string& api_key) noexcept + { + m_api_key = api_key; + } private: + + void get_data(); + std::string m_name; std::string m_description; std::string m_data_format; @@ -376,6 +422,11 @@ namespace shogun m_param_descriptors; std::vector> m_param_qualities; + std::string m_api_key; + + std::shared_ptr m_cached_features; + std::vector m_feature_names; + std::vector m_feature_types; // TODO: replace int with type enum }; /** @@ -384,24 +435,54 @@ namespace shogun class OpenMLSplit { public: + enum class LabelType + { + TRAIN = 1, + TEST = 2 + }; + + /** + * Default constructor. This is used when there are no + * train or test indices. + */ + OpenMLSplit() = default; + OpenMLSplit( - const std::string& split_id, const std::string& split_type, - const std::string& split_url, - const std::unordered_map& - split_parameters) - : m_split_id(split_id), m_split_type(split_type), - m_split_url(split_url), m_parameters(split_parameters) + std::vector> train_idx, + std::vector> test_idx) + : m_train_idx(std::move(train_idx)), m_test_idx(std::move(test_idx)) { } static std::shared_ptr get_split(const std::string& split_url, const std::string& api_key); + SG_FORCED_INLINE std::vector> get_train_idx() const + noexcept + { + return m_train_idx; + } + + SG_FORCED_INLINE std::vector> get_test_idx() const + noexcept + { + return m_test_idx; + } + + SG_FORCED_INLINE bool contains_splits() const noexcept + { + return !m_train_idx.empty() && !m_test_idx.empty(); + } + private: - std::string m_split_id; - std::string m_split_type; - std::string m_split_url; - std::unordered_map m_parameters; + static SGVector + 
dense_feature_to_vector(const std::shared_ptr& feat); + + static std::vector + string_feature_to_vector(const std::shared_ptr& feat); + + std::vector> m_train_idx; + std::vector> m_test_idx; }; /** From df1a8d6df4547c0da27b08329da2b770dadc1f29 Mon Sep 17 00:00:00 2001 From: gf712 Date: Fri, 17 May 2019 10:14:32 +0100 Subject: [PATCH 25/32] get binary classification working [SKIP CI] --- src/shogun/features/CombinedFeatures.cpp | 2 +- src/shogun/io/OpenMLFlow.cpp | 156 +++++++++++++++-------- src/shogun/io/OpenMLFlow.h | 33 +++-- 3 files changed, 129 insertions(+), 62 deletions(-) diff --git a/src/shogun/features/CombinedFeatures.cpp b/src/shogun/features/CombinedFeatures.cpp index 5fedc885f70..9cf72f5748b 100644 --- a/src/shogun/features/CombinedFeatures.cpp +++ b/src/shogun/features/CombinedFeatures.cpp @@ -161,7 +161,7 @@ int32_t CombinedFeatures::get_num_feature_obj() const void CombinedFeatures::init() { SG_ADD(&num_vec, "num_vec", "Number of vectors."); - SG_ADD(&feature_array, "feature_array", "Feature array."); + watch_param("feature_array", &feature_array); } std::shared_ptr CombinedFeatures::create_merged_copy(std::shared_ptr other) const diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index d2dcc4a56cd..73894f73516 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -5,14 +5,12 @@ */ #include -#include #include +#include #include #ifdef HAVE_CURL -#include "OpenMLFlow.h" #include - #endif // HAVE_CURL using namespace shogun; @@ -417,31 +415,54 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) return result; } -std::shared_ptr OpenMLData::get_features() noexcept +std::shared_ptr OpenMLData::get_features() noexcept { if (!m_cached_features) get_data(); return m_cached_features; } -std::shared_ptr OpenMLData::get_features(const std::string& label) +std::shared_ptr OpenMLData::get_features(const std::string& label) { + if (!m_cached_features) + get_data(); auto find_label = - 
std::find(m_feature_names.begin(), m_feature_names.end(), label); + std::find(m_feature_names.begin(), m_feature_names.end(), label); if (find_label == m_feature_names.end()) - SG_SERROR( - "Requested label \"%s\" not in the dataset!\n", label.c_str()) + SG_SERROR("Requested label \"%s\" not in the dataset!\n", label.c_str()) if (!m_cached_features) get_data(); auto col_idx = std::distance(m_feature_names.begin(), find_label); - auto result = std::shared_ptr(m_cached_features->clone()->as()); - if (result->delete_feature_obj(col_idx)) - SG_SERROR("Error deleting the label column in CombinedFeatures!\n") + auto feat_type_copy = m_feature_types; + feat_type_copy.erase(feat_type_copy.begin() + col_idx); + for (const auto type : feat_type_copy) + { + if (type == ARFFDeserializer::Attribute::STRING) + SG_SERROR("Currently cannot process string features!\n") + } + std::shared_ptr result; + bool first = true; + for (int i = 0; i < m_feature_types.size(); ++i) + { + if (i != col_idx && first) + { + result.reset(m_cached_features->get_feature_obj(i)); + first = false; + } + if (i != col_idx) + result.reset(result->create_merged_copy( + m_cached_features->get_feature_obj(i))); + } + std::dynamic_pointer_cast>(result)->set_num_features(m_feature_types.size()); + std::dynamic_pointer_cast>(result)->set_num_vectors(m_cached_features->get_num_vectors()); + return result; } std::shared_ptr OpenMLData::get_labels() { + if (!m_cached_features) + get_data(); REQUIRE( !m_default_target_attribute.empty(), "A default target attribute is required if no label is given!\n") @@ -450,6 +471,8 @@ std::shared_ptr OpenMLData::get_labels() std::shared_ptr OpenMLData::get_labels(const std::string& label_name) { + if (!m_cached_features) + get_data(); auto find_label = std::find(m_feature_names.begin(), m_feature_names.end(), label_name); if (find_label == m_feature_names.end()) @@ -463,29 +486,38 @@ std::shared_ptr OpenMLData::get_labels(const std::string& label_name) auto target_label_as_feat = 
std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); - // TODO: replace with actual enum values - switch(m_feature_types[col_idx]) + switch (m_feature_types[col_idx]) { - // real features - case 0: - { - auto casted_feat = std::dynamic_pointer_cast>(target_label_as_feat); - auto labels_vec = casted_feat->get_feature_vector(0); - auto labels = std::make_shared(); - labels->set_values(labels_vec); - return labels; - } break; - // nominal features - case 1: + // real features + case ARFFDeserializer::Attribute::REAL: + case ARFFDeserializer::Attribute::NUMERIC: + case ARFFDeserializer::Attribute::INTEGER: + case ARFFDeserializer::Attribute::DATE: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + auto labels = std::make_shared(labels_vec); + return labels; + } + break; + // nominal features + case ARFFDeserializer::Attribute::NOMINAL: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + for(auto& val: labels_vec) { - auto casted_feat = std::dynamic_pointer_cast>(target_label_as_feat); - auto labels_vec = casted_feat->get_feature_vector(0); - auto labels = std::make_shared(); - labels->set_values(labels_vec); - return labels; - } break; - default: - SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) + if (val == 0) + val = -1; + } + auto labels = std::make_shared(labels_vec); + return labels; + } + break; + default: + SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) } return nullptr; @@ -494,10 +526,14 @@ std::shared_ptr OpenMLData::get_labels(const std::string& label_name) void OpenMLData::get_data() { auto reader = OpenMLReader(m_api_key); - auto return_string = reader.get(m_url); - - // TODO: add ARFF parsing and don't forget feature names and feature types - m_cached_features = std::make_shared(); + std::shared_ptr ss = + 
std::make_shared(reader.get(m_url)); + + auto parser = ARFFDeserializer(ss); + parser.read(); + m_cached_features = parser.get_features(); + m_feature_names = parser.get_feature_names(); + m_feature_types = parser.get_attribute_types(); } std::shared_ptr @@ -677,14 +713,17 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) task_settings["evaluation_measures"].GetObject(); for (const auto& param : evaluation_info) { - evaluation_measures.emplace( - param.name.GetString(), param.value.GetString()); + if (param.value.IsString()) + evaluation_measures.emplace( + param.name.GetString(), param.value.GetString()); + else + evaluation_measures.emplace(param.name.GetString(), ""); } } } if (openml_dataset == nullptr && openml_split == nullptr) - SG_SERROR("Error parsing task.") + SG_SERROR("Error parsing task.\n") auto result = std::make_shared( task_id, task_name, task_type, task_type_id, evaluation_measures, @@ -996,11 +1035,12 @@ ShogunOpenML::get_class_info(const std::string& class_name) return result; } -CLabels* ShogunOpenML::run_model_on_fold( +std::shared_ptr ShogunOpenML::run_model_on_fold( const std::shared_ptr& model, - const std::shared_ptr& task, CFeatures* X_train, - index_t repeat_number, index_t fold_number, CLabels* y_train, - CFeatures* X_test) + const std::shared_ptr& task, + const std::shared_ptr& X_train, index_t repeat_number, + index_t fold_number, const std::shared_ptr& y_train, + const std::shared_ptr& X_test) { auto task_type = task->get_task_type(); auto model_clone = std::shared_ptr(model->clone()); @@ -1012,9 +1052,14 @@ CLabels* ShogunOpenML::run_model_on_fold( { if (auto machine = std::dynamic_pointer_cast(model_clone)) { - machine->put("labels", y_train); - machine->train(X_train); - return machine->apply(X_test); + machine->put("labels", y_train.get()); + auto tmp = X_train.get(); + machine->train(tmp); + delete tmp; + if (X_test) + return std::shared_ptr(machine->apply(X_test.get())); + else + return 
std::shared_ptr(machine->apply(X_train.get())); } else SG_SERROR("The provided model is not a trainable machine!\n") @@ -1047,15 +1092,26 @@ std::shared_ptr OpenMLRun::run_flow_on_task( std::shared_ptr flow, std::shared_ptr task) { auto data = task->get_dataset(); - std::shared_ptr train_features, test_features; - std::shared_ptr train_labels, test_labels; + std::shared_ptr train_features, test_features = nullptr; + std::shared_ptr train_labels, test_labels = nullptr; if (task->get_split()->contains_splits()) SG_SNOTIMPLEMENTED else { - auto labels = data->get_labels(); - auto feat = data->get_features(); + train_labels = data->get_labels(); + train_features = + data->get_features(data->get_default_target_attribute()); + auto model = ShogunOpenML::flow_to_model(std::move(flow), true); + + if (auto machine = std::dynamic_pointer_cast(model)) + { + auto result = ShogunOpenML::run_model_on_fold( + machine, task, train_features, 0, 0, train_labels, + test_features); + } + else + SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") } return std::shared_ptr(); } diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h index 1ae2691be9a..28d22fcf0d5 100644 --- a/src/shogun/io/OpenMLFlow.h +++ b/src/shogun/io/OpenMLFlow.h @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -351,17 +352,19 @@ namespace shogun get_dataset(const std::string& id, const std::string& api_key); /** - * Returns ALL the features of the dataset, potentially also the labels column + * Returns ALL the features of the dataset, potentially also the labels + * column * @return the features */ - std::shared_ptr get_features() noexcept; + std::shared_ptr get_features() noexcept; /** * Returns the dataset features * @param label_name the name of the attribute containing the label * @return the features */ - std::shared_ptr get_features(const std::string& label_name); + std::shared_ptr + get_features(const std::string& label_name); /** * Returns the dataset labels if 
m_default_target_attribute is not empty @@ -379,12 +382,17 @@ namespace shogun * Returns the type of all attributes/features in the ARFF file * @return */ - // TODO: replace with actual enum values - SG_FORCED_INLINE std::vector get_feature_types() const noexcept + SG_FORCED_INLINE std::vector + get_feature_types() const noexcept { return m_feature_types; } + SG_FORCED_INLINE std::string get_default_target_attribute() const noexcept + { + return m_default_target_attribute; + } + protected: SG_FORCED_INLINE void set_api_key(const std::string& api_key) noexcept { @@ -392,7 +400,6 @@ namespace shogun } private: - void get_data(); std::string m_name; @@ -426,7 +433,7 @@ namespace shogun std::shared_ptr m_cached_features; std::vector m_feature_names; - std::vector m_feature_types; // TODO: replace int with type enum + std::vector m_feature_types; }; /** @@ -558,6 +565,7 @@ namespace shogun std::shared_ptr m_data; }; + class OpenMLRun; /** * The Shogun OpenML extension to run models from an OpenMLFlow * and convert models to OpenMLFlow. @@ -565,6 +573,7 @@ namespace shogun class ShogunOpenML { public: + friend class OpenMLRun; /** * Instantiates a SGObject from an OpenMLFlow. 
* @@ -586,11 +595,13 @@ namespace shogun model_to_flow(const std::shared_ptr& model); protected: - CLabels* run_model_on_fold( + static std::shared_ptr run_model_on_fold( const std::shared_ptr& model, - const std::shared_ptr& task, CFeatures* X_train, - index_t repeat_number, index_t fold_number, CLabels* y_train, - CFeatures* X_test); + const std::shared_ptr& task, + const std::shared_ptr& X_train, + index_t repeat_number, index_t fold_number, + const std::shared_ptr& y_train, + const std::shared_ptr& X_test); private: /** From c615896c5cf4c109d0a06ac2bac96f968912541d Mon Sep 17 00:00:00 2001 From: gf712 Date: Fri, 17 May 2019 16:00:29 +0100 Subject: [PATCH 26/32] smart pointer and refcount fixes --- src/shogun/io/OpenMLFlow.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp index 73894f73516..018351861c1 100644 --- a/src/shogun/io/OpenMLFlow.cpp +++ b/src/shogun/io/OpenMLFlow.cpp @@ -1052,10 +1052,10 @@ std::shared_ptr ShogunOpenML::run_model_on_fold( { if (auto machine = std::dynamic_pointer_cast(model_clone)) { - machine->put("labels", y_train.get()); + // TODO: refactor. 
more useless clones until smart pointers are merged + machine->put("labels", y_train->clone()->as()); auto tmp = X_train.get(); machine->train(tmp); - delete tmp; if (X_test) return std::shared_ptr(machine->apply(X_test.get())); else @@ -1092,8 +1092,8 @@ std::shared_ptr OpenMLRun::run_flow_on_task( std::shared_ptr flow, std::shared_ptr task) { auto data = task->get_dataset(); - std::shared_ptr train_features, test_features = nullptr; - std::shared_ptr train_labels, test_labels = nullptr; + std::shared_ptr train_features = nullptr, test_features = nullptr; + std::shared_ptr train_labels = nullptr, test_labels = nullptr; if (task->get_split()->contains_splits()) SG_SNOTIMPLEMENTED @@ -1102,6 +1102,9 @@ std::shared_ptr OpenMLRun::run_flow_on_task( train_labels = data->get_labels(); train_features = data->get_features(data->get_default_target_attribute()); + // ensures delete is called by shared ptr destructor + SG_REF(train_labels.get()) + SG_REF(train_features.get()) auto model = ShogunOpenML::flow_to_model(std::move(flow), true); if (auto machine = std::dynamic_pointer_cast(model)) @@ -1109,6 +1112,7 @@ std::shared_ptr OpenMLRun::run_flow_on_task( auto result = ShogunOpenML::run_model_on_fold( machine, task, train_features, 0, 0, train_labels, test_features); + SG_SDEBUG(result->to_string().c_str()); } else SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") From c5c9d93c5a8bfe7a5d35303d96d20b520695d72c Mon Sep 17 00:00:00 2001 From: gf712 Date: Sat, 18 May 2019 11:55:10 +0100 Subject: [PATCH 27/32] split openml classes in different files --- src/interfaces/swig/IO.i | 6 +- src/interfaces/swig/IO_includes.i | 2 + src/shogun/io/OpenMLFlow.cpp | 1138 ------------------------- src/shogun/io/OpenMLFlow.h | 683 --------------- src/shogun/io/openml/OpenMLData.cpp | 255 ++++++ src/shogun/io/openml/OpenMLData.h | 162 ++++ src/shogun/io/openml/OpenMLFlow.cpp | 115 +++ src/shogun/io/openml/OpenMLFlow.h | 142 +++ src/shogun/io/openml/OpenMLReader.cpp | 97 +++ 
src/shogun/io/openml/OpenMLReader.h | 183 ++++ src/shogun/io/openml/OpenMLRun.cpp | 68 ++ src/shogun/io/openml/OpenMLRun.h | 78 ++ src/shogun/io/openml/OpenMLSplit.cpp | 112 +++ src/shogun/io/openml/OpenMLSplit.h | 71 ++ src/shogun/io/openml/OpenMLTask.cpp | 137 +++ src/shogun/io/openml/OpenMLTask.h | 95 +++ src/shogun/io/openml/ShogunOpenML.cpp | 340 ++++++++ src/shogun/io/openml/ShogunOpenML.h | 70 ++ src/shogun/io/openml/utils.h | 131 +++ 19 files changed, 2063 insertions(+), 1822 deletions(-) delete mode 100644 src/shogun/io/OpenMLFlow.cpp delete mode 100644 src/shogun/io/OpenMLFlow.h create mode 100644 src/shogun/io/openml/OpenMLData.cpp create mode 100644 src/shogun/io/openml/OpenMLData.h create mode 100644 src/shogun/io/openml/OpenMLFlow.cpp create mode 100644 src/shogun/io/openml/OpenMLFlow.h create mode 100644 src/shogun/io/openml/OpenMLReader.cpp create mode 100644 src/shogun/io/openml/OpenMLReader.h create mode 100644 src/shogun/io/openml/OpenMLRun.cpp create mode 100644 src/shogun/io/openml/OpenMLRun.h create mode 100644 src/shogun/io/openml/OpenMLSplit.cpp create mode 100644 src/shogun/io/openml/OpenMLSplit.h create mode 100644 src/shogun/io/openml/OpenMLTask.cpp create mode 100644 src/shogun/io/openml/OpenMLTask.h create mode 100644 src/shogun/io/openml/ShogunOpenML.cpp create mode 100644 src/shogun/io/openml/ShogunOpenML.h create mode 100644 src/shogun/io/openml/utils.h diff --git a/src/interfaces/swig/IO.i b/src/interfaces/swig/IO.i index 474bba3ba9f..647c48cb370 100644 --- a/src/interfaces/swig/IO.i +++ b/src/interfaces/swig/IO.i @@ -31,6 +31,8 @@ %shared_ptr(shogun::Compressor) %shared_ptr(shogun::OpenMLFlow) +%shared_ptr(shogun::OpenMLRun) +%shared_ptr(shogun::OpenMLTask) #ifdef USE_BOOL %shared_ptr(shogun::StreamingFileFromSparseFeatures) %shared_ptr(shogun::StreamingFileFromDenseFeatures) @@ -184,7 +186,9 @@ namespace shogun %include %include -%include +%include +%include +%include %include %include diff --git 
a/src/interfaces/swig/IO_includes.i b/src/interfaces/swig/IO_includes.i index b5b27ef02f7..8415b314423 100644 --- a/src/interfaces/swig/IO_includes.i +++ b/src/interfaces/swig/IO_includes.i @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/src/shogun/io/OpenMLFlow.cpp b/src/shogun/io/OpenMLFlow.cpp deleted file mode 100644 index 018351861c1..00000000000 --- a/src/shogun/io/OpenMLFlow.cpp +++ /dev/null @@ -1,1138 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). - * - * Authors: Gil Hoben - */ - -#include -#include -#include - -#include -#ifdef HAVE_CURL -#include -#endif // HAVE_CURL - -using namespace shogun; -using namespace rapidjson; - -/** - * The writer callback function used to write the packets to a C++ string. - * @param data the data received in CURL request - * @param size always 1 - * @param nmemb the size of data - * @param buffer_in the buffer to write to - * @return the size of buffer that was written - */ -size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) -{ - // check that the buffer string points to something - if (buffer_in != nullptr) - { - // Append the data to the buffer - buffer_in->append(data, size * nmemb); - - return size * nmemb; - } - return 0; -} - -/* OpenML server format */ -const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; -const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; -const char* OpenMLReader::download_server = ""; -const char* OpenMLReader::splits_server = "https://www.openml.org/api_splits"; - -/* DATA API */ -const char* OpenMLReader::dataset_description = "/data/{}"; -const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; -const char* OpenMLReader::data_features = "/data/features/{}"; -const char* OpenMLReader::data_qualities = "/data/qualities/{}"; -const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; -const 
char* OpenMLReader::list_dataset_filter = "/data/list/{}"; -/* FLOW API */ -const char* OpenMLReader::flow_file = "/flow/{}"; -/* TASK API */ -const char* OpenMLReader::task_file = "/task/{}"; -/* SPLIT API */ -const char* OpenMLReader::get_split = "/get/{}"; - -const std::unordered_map - OpenMLReader::m_format_options = {{"xml", xml_server}, - {"json", json_server}, - {"split", splits_server}, - {"download", download_server}}; -const std::unordered_map - OpenMLReader::m_request_options = { - {"dataset_description", dataset_description}, - {"list_data_qualities", list_data_qualities}, - {"data_features", data_features}, - {"data_qualities", data_qualities}, - {"list_dataset_qualities", list_dataset_qualities}, - {"list_dataset_filter", list_dataset_filter}, - {"flow_file", flow_file}, - {"task_file", task_file}}; - -void OpenMLReader::openml_curl_request_helper(const std::string& url) -{ -#ifdef HAVE_CURL - CURL* curl_handle = nullptr; - - curl_handle = curl_easy_init(); - - if (!curl_handle) - { - SG_SERROR("Failed to initialise curl handle.\n") - return; - } - - curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl_handle, CURLOPT_HTTPGET, 1); - curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); - curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); - - CURLcode res = curl_easy_perform(curl_handle); - - if (res != CURLE_OK) - SG_SERROR("Connection error: %s.\n", curl_easy_strerror(res)) - - curl_easy_cleanup(curl_handle); -#endif // HAVE_CURL -} - -/** - * Checks the returned response from OpenML in JSON format - * @param doc the parsed OpenML JSON format response - */ -static void check_response(const Document& doc, const std::string& type) -{ - if (SG_UNLIKELY(doc.HasMember("error"))) - { - const Value& root = doc["error"]; - SG_SERROR( - "Server error %s: %s\n", root["code"].GetString(), - root["message"].GetString()) - return; - } - REQUIRE( - doc.HasMember(type.c_str()), "Unexpected format of OpenML 
%s.\n", - type.c_str()); -} - -/** - * Helper function to add JSON objects as string in map - * @param v a RapidJSON GenericValue, i.e. string - * @param param_dict the map to write to - * @param name the name of the key - */ -static SG_FORCED_INLINE void emplace_string_to_map( - const GenericValue>& v, - std::unordered_map& param_dict, - const std::string& name) -{ - if (v[name.c_str()].GetType() == Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else - param_dict.emplace(name, ""); -} - -/** - * Helper function to add JSON objects as string in map - * @param v a RapidJSON GenericObject, i.e. array - * @param param_dict the map to write to - * @param name the name of the key - */ -static SG_FORCED_INLINE void emplace_string_to_map( - const GenericObject>>& v, - std::unordered_map& param_dict, - const std::string& name) -{ - if (v[name.c_str()].GetType() == Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else - param_dict.emplace(name, ""); -} - -template -SG_FORCED_INLINE T return_if_possible( - const std::string& name, - const GenericObject>>& v) -{ - SG_SNOTIMPLEMENTED -} - -template <> -SG_FORCED_INLINE std::string return_if_possible( - const std::string& name, - const GenericObject>>& v) -{ - if (v.HasMember(name.c_str()) && v[name.c_str()].IsString()) - return v[name.c_str()].GetString(); - if (v.HasMember(name.c_str()) && !v[name.c_str()].IsString()) - SG_SERROR("Found member \"%s\" but it is not a string", name.c_str()) - if (!v.HasMember(name.c_str())) - return ""; - SG_SERROR("\"%s\" is not a member of the given object", name.c_str()) - return nullptr; -} - -template <> -SG_FORCED_INLINE std::vector -return_if_possible>( - const std::string& name, - const GenericObject>>& v) -{ - std::vector result; - if (!v.HasMember(name.c_str())) - SG_SERROR("\"%s\" is not a member of the given object", name.c_str()) - if (v[name.c_str()].IsString()) - { - result.emplace_back(v[name.c_str()].GetString()); - } 
- if (v[name.c_str()].IsArray()) - { - for (const auto& val : v[name.c_str()].GetArray()) - { - if (val.IsString()) - result.emplace_back(val.GetString()); - else - SG_SERROR("Found non string member in \"%s\".\n", name.c_str()) - } - } - return result; -} - -std::shared_ptr OpenMLFlow::download_flow( - const std::string& flow_id, const std::string& api_key) -{ - Document document; - parameters_type params; - components_type components; - std::string name; - std::string description; - std::string class_name; - - // get flow and parse with RapidJSON - auto reader = OpenMLReader(api_key); - auto return_string = reader.get("flow_file", "json", flow_id); - document.Parse(return_string.c_str()); - check_response(document, "flow"); - - // store root for convenience. We know it exists from previous check. - const Value& root = document["flow"]; - - // handle parameters - if (root.HasMember("parameter")) - { - std::unordered_map param_dict; - - if (root["parameter"].IsArray()) - { - for (const auto& v : root["parameter"].GetArray()) - { - emplace_string_to_map(v, param_dict, "data_type"); - emplace_string_to_map(v, param_dict, "default_value"); - emplace_string_to_map(v, param_dict, "description"); - params.emplace(v["name"].GetString(), param_dict); - param_dict.clear(); - } - } - else - { - // parameter can also be a dict, instead of array - const auto v = root["parameter"].GetObject(); - emplace_string_to_map(v, param_dict, "data_type"); - emplace_string_to_map(v, param_dict, "default_value"); - emplace_string_to_map(v, param_dict, "description"); - params.emplace(v["name"].GetString(), param_dict); - } - } - - // handle components, i.e. 
kernels - if (root.HasMember("component")) - { - if (root["component"].IsArray()) - { - for (const auto& v : root["component"].GetArray()) - { - components.emplace( - v["identifier"].GetString(), - OpenMLFlow::download_flow( - v["flow"]["id"].GetString(), api_key)); - } - } - else - { - components.emplace( - root["component"]["identifier"].GetString(), - OpenMLFlow::download_flow( - root["component"]["flow"]["id"].GetString(), api_key)); - } - } - - // get remaining information from flow - if (root.HasMember("name")) - name = root["name"].GetString(); - if (root.HasMember("description")) - description = root["description"].GetString(); - if (root.HasMember("class_name")) - class_name = root["class_name"].GetString(); - - auto flow = std::make_shared( - name, description, class_name, components, params); - - return flow; -} - -void OpenMLFlow::upload_flow(const std::shared_ptr& flow) -{ - SG_SNOTIMPLEMENTED; -} - -void OpenMLFlow::dump() const -{ - SG_SNOTIMPLEMENTED; -} - -std::shared_ptr OpenMLFlow::from_file() -{ - SG_SNOTIMPLEMENTED; - return std::shared_ptr(); -} - -std::shared_ptr -OpenMLData::get_dataset(const std::string& id, const std::string& api_key) -{ - // description - Document document; - auto reader = OpenMLReader(api_key); - auto return_string = reader.get("dataset_description", "json", id); - - document.Parse(return_string.c_str()); - check_response(document, "data_set_description"); - - const Value& dataset_description = document["data_set_description"]; - - auto name = return_if_possible( - "name", dataset_description.GetObject()); - auto description = return_if_possible( - "description", dataset_description.GetObject()); - auto data_format = return_if_possible( - "data_format", dataset_description.GetObject()); - auto dataset_id = - return_if_possible("id", dataset_description.GetObject()); - auto version = return_if_possible( - "version", dataset_description.GetObject()); - auto creator = return_if_possible( - "creator", 
dataset_description.GetObject()); - auto contributor = return_if_possible( - "contributor", dataset_description.GetObject()); - auto collection_date = return_if_possible( - "collection_date", dataset_description.GetObject()); - auto upload_date = return_if_possible( - "upload_date", dataset_description.GetObject()); - auto language = return_if_possible( - "language", dataset_description.GetObject()); - auto licence = return_if_possible( - "licence", dataset_description.GetObject()); - auto url = - return_if_possible("url", dataset_description.GetObject()); - auto default_target_attribute = return_if_possible( - "default_target_attribute", dataset_description.GetObject()); - auto row_id_attribute = return_if_possible( - "row_id_attribute", dataset_description.GetObject()); - auto ignore_attribute = return_if_possible( - "ignore_attribute", dataset_description.GetObject()); - auto version_label = return_if_possible( - "version_label", dataset_description.GetObject()); - auto citation = return_if_possible( - "citation", dataset_description.GetObject()); - auto tags = return_if_possible>( - "tag", dataset_description.GetObject()); - auto visibility = return_if_possible( - "visibility", dataset_description.GetObject()); - auto original_data_url = return_if_possible( - "original_data_url", dataset_description.GetObject()); - auto paper_url = return_if_possible( - "paper_url", dataset_description.GetObject()); - auto update_comment = return_if_possible( - "update_comment", dataset_description.GetObject()); - auto md5_checksum = return_if_possible( - "md5_checksum", dataset_description.GetObject()); - - // features - std::vector>> - param_vector; - return_string = reader.get("data_features", "json", id); - document.Parse(return_string.c_str()); - check_response(document, "data_features"); - const Value& dataset_features = document["data_features"]; - for (const auto& param : dataset_features["feature"].GetArray()) - { - std::unordered_map> param_map; - for (const auto& 
param_descriptors : param.GetObject()) - { - std::vector second; - if (param_descriptors.value.IsArray()) - for (const auto& v : param_descriptors.value.GetArray()) - second.emplace_back(v.GetString()); - else - second.emplace_back(param_descriptors.value.GetString()); - - param_map.emplace(param_descriptors.name.GetString(), second); - } - param_vector.push_back(param_map); - } - - // qualities - std::vector> qualities_vector; - return_string = reader.get("data_qualities", "json", id); - document.Parse(return_string.c_str()); - check_response(document, "data_qualities"); - const Value& data_qualities = document["data_qualities"]; - for (const auto& param : data_qualities["quality"].GetArray()) - { - std::unordered_map param_map; - for (const auto& param_quality : param.GetObject()) - { - if (param_quality.name.IsString() && param_quality.value.IsString()) - param_map.emplace( - param_quality.name.GetString(), - param_quality.value.GetString()); - else if (param_quality.name.IsString()) - param_map.emplace(param_quality.name.GetString(), ""); - } - qualities_vector.push_back(param_map); - } - - auto result = std::make_shared( - name, description, data_format, dataset_id, version, creator, - contributor, collection_date, upload_date, language, licence, url, - default_target_attribute, row_id_attribute, ignore_attribute, - version_label, citation, tags, visibility, original_data_url, paper_url, - update_comment, md5_checksum, param_vector, qualities_vector); - result->set_api_key(api_key); - return result; -} - -std::shared_ptr OpenMLData::get_features() noexcept -{ - if (!m_cached_features) - get_data(); - return m_cached_features; -} - -std::shared_ptr OpenMLData::get_features(const std::string& label) -{ - if (!m_cached_features) - get_data(); - auto find_label = - std::find(m_feature_names.begin(), m_feature_names.end(), label); - if (find_label == m_feature_names.end()) - SG_SERROR("Requested label \"%s\" not in the dataset!\n", label.c_str()) - if 
(!m_cached_features) - get_data(); - auto col_idx = std::distance(m_feature_names.begin(), find_label); - auto feat_type_copy = m_feature_types; - feat_type_copy.erase(feat_type_copy.begin() + col_idx); - for (const auto type : feat_type_copy) - { - if (type == ARFFDeserializer::Attribute::STRING) - SG_SERROR("Currently cannot process string features!\n") - } - std::shared_ptr result; - bool first = true; - for (int i = 0; i < m_feature_types.size(); ++i) - { - if (i != col_idx && first) - { - result.reset(m_cached_features->get_feature_obj(i)); - first = false; - } - if (i != col_idx) - result.reset(result->create_merged_copy( - m_cached_features->get_feature_obj(i))); - } - std::dynamic_pointer_cast>(result)->set_num_features(m_feature_types.size()); - std::dynamic_pointer_cast>(result)->set_num_vectors(m_cached_features->get_num_vectors()); - - return result; -} - -std::shared_ptr OpenMLData::get_labels() -{ - if (!m_cached_features) - get_data(); - REQUIRE( - !m_default_target_attribute.empty(), - "A default target attribute is required if no label is given!\n") - return get_labels(m_default_target_attribute); -} - -std::shared_ptr OpenMLData::get_labels(const std::string& label_name) -{ - if (!m_cached_features) - get_data(); - auto find_label = - std::find(m_feature_names.begin(), m_feature_names.end(), label_name); - if (find_label == m_feature_names.end()) - SG_SERROR( - "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) - auto col_idx = std::distance(m_feature_names.begin(), find_label); - - if (!m_cached_features) - get_data(); - - auto target_label_as_feat = - std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); - - switch (m_feature_types[col_idx]) - { - // real features - case ARFFDeserializer::Attribute::REAL: - case ARFFDeserializer::Attribute::NUMERIC: - case ARFFDeserializer::Attribute::INTEGER: - case ARFFDeserializer::Attribute::DATE: - { - auto casted_feat = std::dynamic_pointer_cast>( - target_label_as_feat); - auto 
labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); - auto labels = std::make_shared(labels_vec); - return labels; - } - break; - // nominal features - case ARFFDeserializer::Attribute::NOMINAL: - { - auto casted_feat = std::dynamic_pointer_cast>( - target_label_as_feat); - auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); - for(auto& val: labels_vec) - { - if (val == 0) - val = -1; - } - auto labels = std::make_shared(labels_vec); - return labels; - } - break; - default: - SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) - } - - return nullptr; -} - -void OpenMLData::get_data() -{ - auto reader = OpenMLReader(m_api_key); - std::shared_ptr ss = - std::make_shared(reader.get(m_url)); - - auto parser = ARFFDeserializer(ss); - parser.read(); - m_cached_features = parser.get_features(); - m_feature_names = parser.get_feature_names(); - m_feature_types = parser.get_attribute_types(); -} - -std::shared_ptr -OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) -{ - auto reader = OpenMLReader(api_key); - auto return_string = reader.get("get_split", "split", split_url); - - if (return_string == "Task not providing datasplits.") - return std::make_shared(); - - auto return_stream = std::istringstream(return_string); - // TODO: add ARFF parsing here - // get train/test indices - // TODO: replace line below with ARFFDeserialiser::get_features() - auto arff_features = std::make_shared(); - REQUIRE( - arff_features->get_num_feature_obj() == 4, - "Expected a ARFF file with 4 attributes: type, rowid, repeat and " - "fold.\n") - - auto train_test_feat = - std::shared_ptr(arff_features->get_feature_obj(0)); - auto rowid_feat = - std::shared_ptr(arff_features->get_feature_obj(1)); - auto repeat_feat = - std::shared_ptr(arff_features->get_feature_obj(2)); - auto fold_feat = - std::shared_ptr(arff_features->get_feature_obj(3)); - - auto type_vector = string_feature_to_vector(train_test_feat); - auto 
rowid_vector = dense_feature_to_vector(rowid_feat); - auto repeat_vector = dense_feature_to_vector(repeat_feat); - auto fold_vector = dense_feature_to_vector(fold_feat); - - std::vector> train_idx, test_idx; - for (int i = 0; i < arff_features->get_num_vectors(); ++i) - { - if (type_vector[i] == LabelType::TRAIN) - train_idx.emplace_back(std::initializer_list{ - static_cast(rowid_vector[i]), - static_cast(repeat_vector[i]), - static_cast(fold_vector[i])}); - else - test_idx.emplace_back(std::initializer_list{ - static_cast(rowid_vector[i]), - static_cast(repeat_vector[i]), - static_cast(fold_vector[i])}); - } - - return std::make_shared(train_idx, test_idx); -} - -SGVector -OpenMLSplit::dense_feature_to_vector(const std::shared_ptr& feat) -{ - auto casted_feat = - std::dynamic_pointer_cast>(feat); - // this should never happen - if (!casted_feat) - SG_SERROR("Error casting a column in the split file from CFeatures to " - "CDenseFeatures!\n>"); - return casted_feat->get_feature_vector(0); -} - -std::vector -OpenMLSplit::string_feature_to_vector(const std::shared_ptr& feat) -{ - auto casted_feat = std::dynamic_pointer_cast>(feat); - // this should never happen - if (!casted_feat) - SG_SERROR("Error casting a column in the split file from CFeatures to " - "CStringFeatures!\n"); - - auto to_lower = [](const std::string& line) { - std::string result; - std::transform( - line.begin(), line.end(), std::back_inserter(result), - [](uint8_t val) { return std::tolower(val); }); - return result; - }; - - std::vector result; - - for (int i = 0; i < casted_feat->get_num_vectors(); ++i) - { - auto row = casted_feat->get_feature_vector(i); - std::string label(1, row[0]); - for (auto j = 1; j < casted_feat->get_max_vector_length(); ++j) - label.append(1, row[j]); - if (to_lower(label) == "train") - result.push_back(LabelType::TRAIN); - else if (to_lower(label) == "test") - result.push_back(LabelType::TEST); - else - SG_SERROR("Unknown label type in split file %s!\n", label.c_str()) 
- } - return result; -} - -std::shared_ptr -OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) -{ - Document document; - std::string task_name; - std::string task_type_id; - std::shared_ptr openml_dataset = nullptr; - std::shared_ptr openml_split = nullptr; - std::unordered_map evaluation_measures; - - auto reader = OpenMLReader(api_key); - auto return_string = reader.get("task_file", "json", task_id); - - document.Parse(return_string.c_str()); - check_response(document, "task"); - - const Value& root = document["task"]; - - REQUIRE( - task_id == root["task_id"].GetString(), - "Expected downloaded task to have the same id as the requested task " - "id, but got \"%s\", instead of \"%s\".\n", - root["task_id"].GetString(), task_id.c_str()) - - task_name = root["task_name"].GetString(); - OpenMLTask::TaskType task_type = - get_task_from_string(root["task_type"].GetString()); - task_type_id = root["task_type_id"].GetString(); - - // expect two elements in input array: dataset and split - const Value& json_input = root["input"]; - - auto input_array = json_input.GetArray(); - - for (const auto& task_settings : input_array) - { - if (strcmp(task_settings["name"].GetString(), "source_data") == 0) - { - auto dataset_info = task_settings["data_set"].GetObject(); - std::string dataset_id = dataset_info["data_set_id"].GetString(); - std::string target_feature = - dataset_info["target_feature"].GetString(); - openml_dataset = OpenMLData::get_dataset(dataset_id, api_key); - } - else if ( - strcmp(task_settings["name"].GetString(), "estimation_procedure") == - 0) - { - auto split_info = task_settings["estimation_procedure"].GetObject(); - std::string split_id = split_info["id"].GetString(); - std::string split_type = split_info["type"].GetString(); - std::string split_url = split_info["data_splits_url"].GetString(); - std::unordered_map split_parameters; - for (const auto& param : split_info["parameter"].GetArray()) - { - if (param.HasMember("name") && 
param.HasMember("value")) - split_parameters.emplace( - param["name"].GetString(), param["value"].GetString()); - else if (param.HasMember("name")) - split_parameters.emplace(param["name"].GetString(), ""); - else - SG_SERROR( - "Unexpected number of parameters in parameter array " - "of estimation_procedure.\n") - } - REQUIRE( - split_type == "crossvalidation", - "Currently only tasks with cross validation are enabled in " - "shogun!\n") - openml_split = OpenMLSplit::get_split(split_url, api_key); - } - else if ( - strcmp(task_settings["name"].GetString(), "evaluation_measures") == - 0) - { - auto evaluation_info = - task_settings["evaluation_measures"].GetObject(); - for (const auto& param : evaluation_info) - { - if (param.value.IsString()) - evaluation_measures.emplace( - param.name.GetString(), param.value.GetString()); - else - evaluation_measures.emplace(param.name.GetString(), ""); - } - } - } - - if (openml_dataset == nullptr && openml_split == nullptr) - SG_SERROR("Error parsing task.\n") - - auto result = std::make_shared( - task_id, task_name, task_type, task_type_id, evaluation_measures, - openml_split, openml_dataset); - - return result; -} - -OpenMLTask::TaskType -OpenMLTask::get_task_from_string(const std::string& task_type) -{ - if (task_type == "Supervised Classification") - return OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION; - SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) -} - -SGMatrix OpenMLTask::get_train_indices() const -{ - SG_SNOTIMPLEMENTED - return SGMatrix(); -} - -SGMatrix OpenMLTask::get_test_indices() const -{ - SG_SNOTIMPLEMENTED - return SGMatrix(); -} - -/** - * Class using the Any visitor pattern to convert - * a string to a C++ type that can be used as a parameter - * in a Shogun model. If the string value is not "null" it will - * be put in its casted type in the given model with the provided parameter - * name. If the value is null nothing happens, i.e. 
no error is thrown - * and no value is put in model. - */ -class StringToShogun : public AnyVisitor -{ -public: - explicit StringToShogun(std::shared_ptr model) - : m_model(model), m_parameter(""), m_string_val(""){}; - - StringToShogun( - std::shared_ptr model, const std::string& parameter, - const std::string& string_val) - : m_model(model), m_parameter(parameter), m_string_val(string_val){}; - - void on(bool* v) final - { - SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) - { - bool result = strcmp(m_string_val.c_str(), "true") == 0; - m_model->put(m_parameter, result); - } - } - void on(int32_t* v) final - { - SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) - { - try - { - int32_t result = std::stoi(m_string_val); - m_model->put(m_parameter, result); - } - catch (const std::invalid_argument&) - { - // it's an option, i.e. internally represented - // as an enum but in swig exposed as a string - m_string_val.erase( - std::remove_if( - m_string_val.begin(), m_string_val.end(), - // remove quotes - [](const auto& val) { return val == '\"'; }), - m_string_val.end()); - m_model->put(m_parameter, m_string_val); - } - } - } - void on(int64_t* v) final - { - SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) - { - - int64_t result = std::stol(m_string_val); - m_model->put(m_parameter, result); - } - } - void on(float* v) final - { - SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) - { - float32_t result = std::stof(m_string_val); - m_model->put(m_parameter, result); - } - } - void on(double* v) final - { - SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) - { - float64_t result = std::stod(m_string_val); - m_model->put(m_parameter, result); - } - } - void on(long double* v) - { - SG_SDEBUG( - "long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - if (!is_null()) 
- { - floatmax_t result = std::stold(m_string_val); - m_model->put(m_parameter, result); - } - } - void on(CSGObject** v) final - { - SG_SDEBUG( - "CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - } - void on(SGVector* v) final - { - SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - } - void on(SGVector* v) final - { - SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) - } - void on(SGVector* v) final - { - SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) - } - void on(SGMatrix* mat) final - { - SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) - } - void on(SGMatrix* mat) final - { - SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) - } - void on(SGMatrix* mat) final{SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())} - - /** - * In OpenML "null" is an empty parameter value field. - * @return whether the field is "null" - */ - SG_FORCED_INLINE bool is_null() const noexcept - { - bool result = strcmp(m_string_val.c_str(), "null") == 0; - return result; - } - - SG_FORCED_INLINE void set_parameter_name(const std::string& name) noexcept - { - m_parameter = name; - } - - SG_FORCED_INLINE void set_string_value(const std::string& value) noexcept - { - m_string_val = value; - } - -private: - std::shared_ptr m_model; - std::string m_parameter; - std::string m_string_val; -}; - -/** - * Instantiates a CSGObject using a factory - * @param factory_name the name of the factory - * @param algo_name the name of algorithm passed to factory - * @return the instantiated object using a factory - */ -std::shared_ptr instantiate_model_from_factory( - const std::string& factory_name, const std::string& algo_name) -{ - if (factory_name == "machine") - return std::shared_ptr(machine(algo_name)); - if (factory_name == "kernel") - return std::shared_ptr(kernel(algo_name)); - if (factory_name == "distance") 
- return std::shared_ptr(distance(algo_name)); - - SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) - - return nullptr; -} - -/** - * Downcasts a CSGObject and puts it in the map of obj. - * @param obj the main object - * @param nested_obj the object to be casted and put in the obj map. - * @param parameter_name the name of nested_obj - */ -void cast_and_put( - const std::shared_ptr& obj, - const std::shared_ptr& nested_obj, - const std::string& parameter_name) -{ - if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) - { - // TODO: remove clone - // temporary fix until shared_ptr PR merged - auto* tmp_clone = dynamic_cast(casted_obj->clone()); - obj->put(parameter_name, tmp_clone); - return; - } - if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) - { - auto* tmp_clone = dynamic_cast(casted_obj->clone()); - obj->put(parameter_name, tmp_clone); - return; - } - if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) - { - auto* tmp_clone = dynamic_cast(casted_obj->clone()); - obj->put(parameter_name, tmp_clone); - return; - } - SG_SERROR("Could not cast SGObject.\n") -} - -std::shared_ptr ShogunOpenML::flow_to_model( - std::shared_ptr flow, bool initialize_with_defaults) -{ - auto params = flow->get_parameters(); - auto components = flow->get_components(); - auto class_name = get_class_info(flow->get_class_name()); - auto module_name = class_name.first; - auto algo_name = class_name.second; - - auto obj = instantiate_model_from_factory(module_name, algo_name); - auto obj_param = obj->get_params(); - - auto visitor = std::make_unique(obj); - - if (initialize_with_defaults) - { - for (const auto& param : params) - { - Any any_val = obj_param.at(param.first)->get_value(); - std::string name = param.first; - std::string val_as_string = param.second.at("default_value"); - visitor->set_parameter_name(name); - visitor->set_string_value(val_as_string); - any_val.visit(visitor.get()); - } - } - - for (const auto& component : components) 
- { - std::shared_ptr nested_obj = - flow_to_model(component.second, initialize_with_defaults); - cast_and_put(obj, nested_obj, component.first); - } - - SG_SDEBUG("Final object: %s.\n", obj->to_string().c_str()); - - return obj; -} - -std::shared_ptr -ShogunOpenML::model_to_flow(const std::shared_ptr& model) -{ - return std::shared_ptr(); -} - -std::pair -ShogunOpenML::get_class_info(const std::string& class_name) -{ - std::vector class_components; - auto begin = class_name.begin(); - std::pair result; - - for (auto it = class_name.begin(); it != class_name.end(); ++it) - { - if (*it == '.') - { - class_components.emplace_back(std::string(begin, it)); - begin = std::next(it); - } - if (std::next(it) == class_name.end()) - class_components.emplace_back(std::string(begin, std::next(it))); - } - - if (class_components[0] == "shogun" && class_components.size() == 3) - result = std::make_pair(class_components[1], class_components[2]); - else if (class_components[0] == "shogun" && class_components.size() != 3) - SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) - else - SG_SERROR( - "The provided flow is not meant for shogun deserialisation! The " - "required library is \"%s\".\n", - class_components[0].c_str()) - - return result; -} - -std::shared_ptr ShogunOpenML::run_model_on_fold( - const std::shared_ptr& model, - const std::shared_ptr& task, - const std::shared_ptr& X_train, index_t repeat_number, - index_t fold_number, const std::shared_ptr& y_train, - const std::shared_ptr& X_test) -{ - auto task_type = task->get_task_type(); - auto model_clone = std::shared_ptr(model->clone()); - - switch (task_type) - { - case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: - case OpenMLTask::TaskType::SUPERVISED_REGRESSION: - { - if (auto machine = std::dynamic_pointer_cast(model_clone)) - { - // TODO: refactor. 
more useless clones until smart pointers are merged - machine->put("labels", y_train->clone()->as()); - auto tmp = X_train.get(); - machine->train(tmp); - if (X_test) - return std::shared_ptr(machine->apply(X_test.get())); - else - return std::shared_ptr(machine->apply(X_train.get())); - } - else - SG_SERROR("The provided model is not a trainable machine!\n") - } - break; - case OpenMLTask::TaskType::LEARNING_CURVE: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::CLUSTERING: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: - SG_SNOTIMPLEMENTED - } - return nullptr; -} - -std::shared_ptr OpenMLRun::run_model_on_task( - std::shared_ptr model, std::shared_ptr task) -{ - SG_SNOTIMPLEMENTED - return std::shared_ptr(); -} - -std::shared_ptr OpenMLRun::run_flow_on_task( - std::shared_ptr flow, std::shared_ptr task) -{ - auto data = task->get_dataset(); - std::shared_ptr train_features = nullptr, test_features = nullptr; - std::shared_ptr train_labels = nullptr, test_labels = nullptr; - - if (task->get_split()->contains_splits()) - SG_SNOTIMPLEMENTED - else - { - train_labels = data->get_labels(); - train_features = - data->get_features(data->get_default_target_attribute()); - // ensures delete is called by shared ptr destructor - SG_REF(train_labels.get()) - SG_REF(train_features.get()) - auto model = ShogunOpenML::flow_to_model(std::move(flow), true); - - if (auto machine = std::dynamic_pointer_cast(model)) - { - auto result = ShogunOpenML::run_model_on_fold( - machine, task, train_features, 0, 0, train_labels, - test_features); - SG_SDEBUG(result->to_string().c_str()); - } - else - SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") - } - return std::shared_ptr(); -} - -std::shared_ptr 
-OpenMLRun::from_filesystem(const std::string& directory) -{ - SG_SNOTIMPLEMENTED - return nullptr; -} - -void OpenMLRun::to_filesystem(const std::string& directory) const -{ - SG_SNOTIMPLEMENTED -} - -void OpenMLRun::publish() const -{ - SG_SNOTIMPLEMENTED -} diff --git a/src/shogun/io/OpenMLFlow.h b/src/shogun/io/OpenMLFlow.h deleted file mode 100644 index 28d22fcf0d5..00000000000 --- a/src/shogun/io/OpenMLFlow.h +++ /dev/null @@ -1,683 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). - * - * Authors: Gil Hoben - */ - -#ifndef SHOGUN_OPENMLFLOW_H -#define SHOGUN_OPENMLFLOW_H - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace shogun -{ - /** - * Reads OpenML streams which can be downloaded with this function. - */ - class OpenMLReader - { - - public: - explicit OpenMLReader(const std::string& api_key) : m_api_key(api_key) - { - } - - /** - * Returns a string returned by the server given a request. - * Raises an error if the returned code is not 200. - * Additional arguments can be passed to the request, - * which are then concatenated with a "/" character. - * - * @tparam Args argument type pack, should all be std::string - * @param request the request name, see m_request_options - * @param format the format to return the data in, see m_format_options - * @param args the additional arguments to be passed to request - * @return the returned stream from the server if the return code is 200 - */ - template - std::string - get(const std::string& request, const std::string& format, Args... 
args) - { - std::string request_path; - auto find_format = m_format_options.find(format); - if (find_format == m_format_options.end()) - { - SG_SERROR( - "The provided format \"%s\" is not available\n", - format.c_str()) - } - - if (format == "split") - { - REQUIRE( - request == "get_split", - "Split server can only handle \"get_split\" request.\n") - request_path = get_split; - } - else - { - auto find_request = m_request_options.find(request); - if (find_request == m_request_options.end()) - { - SG_SERROR( - "Could not find a way to solve the request \"%s\"\n", - request.c_str()) - } - request_path = find_request->second; - } - - std::string request_format = find_format->second; - - // get additional args and concatenate them with "/" - if (sizeof...(Args) > 0) - { - if (request_path.substr(request_path.size() - 2) == "{}") - { - request_path = - request_path.substr(0, request_path.size() - 2); - } - else - { - SG_SERROR( - "The provided request \"%s\" cannot handle additional " - "args.\n", - request.c_str()) - } - std::vector args_vec = {args...}; - std::string args_string = std::accumulate( - args_vec.begin() + 1, args_vec.end(), args_vec.front(), - [](std::string s0, std::string& s1) { - return s0 += "/" + s1; - }); - request_path += args_string; - } - - std::string url = request_format + request_path + "?" + m_api_key; - - return get(url); - } - - std::string get(const std::string& url) - { -#ifdef HAVE_CURL - // clear the buffer before request - m_curl_response_buffer.clear(); - - openml_curl_request_helper(url); - return m_curl_response_buffer; -#else - SG_SERROR( - "This function is only available with the CURL library!\n") -#endif // HAVE_CURL - } - - private: - /** the raw buffer as a C++ string */ - std::string m_curl_response_buffer; - - /** - * Initialises CURL session and gets the data. - * This function also handles the response code from the server. 
- * - * @param url the url to query - */ - void openml_curl_request_helper(const std::string& url); - - /** the user API key, not required for all requests */ - std::string m_api_key; - - /** the server path to get a response in XML format*/ - static const char* xml_server; - /** the server path to get a response in JSON format*/ - static const char* json_server; - /** the server path to download datasets */ - static const char* download_server; - /** the server path to get a split in ARFF format */ - static const char* splits_server; - - /** the server response format options: XML or JSON */ - static const std::unordered_map - m_format_options; - /** all the supported server options */ - static const std::unordered_map - m_request_options; - - /* DATA API */ - static const char* dataset_description; - static const char* list_data_qualities; - static const char* data_features; - static const char* data_qualities; - static const char* list_dataset_qualities; - static const char* list_dataset_filter; - - /* FLOW API */ - static const char* flow_file; - - /* TASK API */ - static const char* task_file; - - /* SPLIT API */ - static const char* get_split; - }; - - /** - * Writes OpenML streams to the OpenML server. - */ - class OpenMLWritter - { - public: - OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; - - private: - /** the user API key, likely to be needed to write to OpenML */ - std::string m_api_key; - }; - - /** - * Handles OpenML flows. A flow contains the information - * required to instantiate a model. - */ - class OpenMLFlow - { - - public: - /** alias for component type, map of flows */ - using components_type = - std::unordered_map>; - /** alias for parameter type, map of maps with information specific to a - * parameter */ - using parameters_type = std::unordered_map< - std::string, std::unordered_map>; - - /** - * The OpenMLFlow constructor. 
This constructor is rarely used by the - * user and is used by the static class members download_flow and - * from_file. The user is expected to use either of the previously - * mentioned functions. - * - * @param name the model name - * @param description the model description - * @param model the flow class_name field - * @param components a map of subflows, i.e. kernels - * @param parameters a map of parameter information, i.e. default values - * for each parameter name - */ - OpenMLFlow( - const std::string& name, const std::string& description, - const std::string& model, components_type components, - parameters_type parameters) - : m_name(name), m_description(description), m_class_name(model), - m_parameters(std::move(parameters)), - m_components(std::move(components)) - { - } - - /** - * Instantiates a OpenMLFlow by downloaded a flow from the OpenML - * server. - * - * @param flow_id the flow ID - * @param api_key the user API key (might not be required and can be an - * empty string) - * @return the OpenMLFlow corresponding to the flow requested - * @throws ShogunException when there is a server error or the requested - * flow is ill formed. - */ - static std::shared_ptr - download_flow(const std::string& flow_id, const std::string& api_key); - - /** - * Instantiates a OpenMLFlow from a file. - * @return the OpenMLFlow corresponding to the flow requested - */ - static std::shared_ptr from_file(); - - /** - * Publishes a flow to the OpenML server - * @param flow the flow to be published - */ - static void upload_flow(const std::shared_ptr& flow); - - /** - * Dumps the OpenMLFlow to disk. - */ - void dump() const; - - /** - * Gets a subflow, i.e. 
a kernel in a machine - * @param name the name of the subflow, not the flow ID - * @return the subflow if it exists - */ - std::shared_ptr get_subflow(const std::string& name) - { - auto find_flow = m_components.find(name); - if (find_flow != m_components.end()) - return find_flow->second; - else - SG_SERROR( - "The provided subflow could not be found in this flow!") - return nullptr; - } - -#ifndef SWIG - SG_FORCED_INLINE parameters_type get_parameters() const noexcept - { - return m_parameters; - } - - SG_FORCED_INLINE components_type get_components() const noexcept - { - return m_components; - } - - SG_FORCED_INLINE std::string get_class_name() const noexcept - { - return m_class_name; - } -#endif // SWIG - - private: - /** name field of the flow */ - std::string m_name; - /** description field of the flow */ - std::string m_description; - /** the class_name field of the flow */ - std::string m_class_name; - /** the parameter field of the flow (optional) */ - parameters_type m_parameters; - /** the components fields of the flow (optional) */ - components_type m_components; - }; - - /** - * Handles an OpenML dataset. 
- */ - class OpenMLData - { - public: - OpenMLData( - const std::string& name, const std::string& description, - const std::string& data_format, const std::string& dataset_id, - const std::string& version, const std::string& creator, - const std::string& contributor, const std::string& collection_date, - const std::string& upload_date, const std::string& language, - const std::string& license, const std::string& url, - const std::string& default_target_attribute, - const std::string& row_id_attribute, - const std::string& ignore_attribute, - const std::string& version_label, const std::string& citation, - std::vector tag, const std::string& visibility, - const std::string& original_data_url, const std::string& paper_url, - const std::string& update_comment, const std::string& md5_checksum, - std::vector< - std::unordered_map>> - param_descriptors, - std::vector> - param_qualities) - : m_name(name), m_description(description), - m_data_format(data_format), m_dataset_id(dataset_id), - m_version(version), m_creator(creator), - m_contributor(contributor), m_collection_date(collection_date), - m_upload_date(upload_date), m_language(language), - m_license(license), m_url(url), - m_default_target_attribute(default_target_attribute), - m_row_id_attribute(row_id_attribute), - m_ignore_attribute(ignore_attribute), - m_version_label(version_label), m_citation(citation), - m_tag(std::move(tag)), m_visibility(visibility), - m_original_data_url(original_data_url), m_paper_url(paper_url), - m_update_comment(update_comment), m_md5_checksum(md5_checksum), - m_param_descriptors(std::move(param_descriptors)), - m_param_qualities(std::move(param_qualities)) - { - } - - /** - * Creates a dataset instance from a given ID. 
- * - */ - static std::shared_ptr - get_dataset(const std::string& id, const std::string& api_key); - - /** - * Returns ALL the features of the dataset, potentially also the labels - * column - * @return the features - */ - std::shared_ptr get_features() noexcept; - - /** - * Returns the dataset features - * @param label_name the name of the attribute containing the label - * @return the features - */ - std::shared_ptr - get_features(const std::string& label_name); - - /** - * Returns the dataset labels if m_default_target_attribute is not empty - * @return the labels - */ - std::shared_ptr get_labels(); - - /** - * Returns the dataset labels given the label_name - * @return the labels - */ - std::shared_ptr get_labels(const std::string& label_name); - - /** - * Returns the type of all attributes/features in the ARFF file - * @return - */ - SG_FORCED_INLINE std::vector - get_feature_types() const noexcept - { - return m_feature_types; - } - - SG_FORCED_INLINE std::string get_default_target_attribute() const noexcept - { - return m_default_target_attribute; - } - - protected: - SG_FORCED_INLINE void set_api_key(const std::string& api_key) noexcept - { - m_api_key = api_key; - } - - private: - void get_data(); - - std::string m_name; - std::string m_description; - std::string m_data_format; - std::string m_dataset_id; - std::string m_version; - std::string m_creator; - std::string m_contributor; - std::string m_collection_date; - std::string m_upload_date; - std::string m_language; - std::string m_license; - std::string m_url; - std::string m_default_target_attribute; - std::string m_row_id_attribute; - std::string m_ignore_attribute; - std::string m_version_label; - std::string m_citation; - std::vector m_tag; - std::string m_visibility; - std::string m_original_data_url; - std::string m_paper_url; - std::string m_update_comment; - std::string m_md5_checksum; - std::vector>> - m_param_descriptors; - std::vector> - m_param_qualities; - std::string m_api_key; - - 
std::shared_ptr m_cached_features; - std::vector m_feature_names; - std::vector m_feature_types; - }; - - /** - * Handles an OpenML split. - */ - class OpenMLSplit - { - public: - enum class LabelType - { - TRAIN = 1, - TEST = 2 - }; - - /** - * Default constructor. This is used when there are no - * train or test indices. - */ - OpenMLSplit() = default; - - OpenMLSplit( - std::vector> train_idx, - std::vector> test_idx) - : m_train_idx(std::move(train_idx)), m_test_idx(std::move(test_idx)) - { - } - - static std::shared_ptr - get_split(const std::string& split_url, const std::string& api_key); - - SG_FORCED_INLINE std::vector> get_train_idx() const - noexcept - { - return m_train_idx; - } - - SG_FORCED_INLINE std::vector> get_test_idx() const - noexcept - { - return m_test_idx; - } - - SG_FORCED_INLINE bool contains_splits() const noexcept - { - return !m_train_idx.empty() && !m_test_idx.empty(); - } - - private: - static SGVector - dense_feature_to_vector(const std::shared_ptr& feat); - - static std::vector - string_feature_to_vector(const std::shared_ptr& feat); - - std::vector> m_train_idx; - std::vector> m_test_idx; - }; - - /** - * Handles OpenML tasks. A task contains all the information - * required to train and test a model. 
- */ - class OpenMLTask - { - public: - enum class TaskType - { - SUPERVISED_CLASSIFICATION = 0, - SUPERVISED_REGRESSION = 1, - LEARNING_CURVE = 2, - SUPERVISED_DATASTREAM_CLASSIFICATION = 3, - CLUSTERING = 4, - MACHINE_LEARNING_CHALLENGE = 5, - SURVIVAL_ANALYSIS = 6, - SUBGROUP_DISCOVERY = 7 - }; - - enum class TaskEvaluation - { - - }; - - OpenMLTask( - const std::string& task_id, const std::string task_name, - TaskType task_type, const std::string& task_type_id, - std::unordered_map evaluation_measures, - std::shared_ptr split, - std::shared_ptr data) - : m_task_id(task_id), m_task_name(task_name), - m_task_type(task_type), m_task_type_id(task_type_id), - m_evaluation_measures(std::move(evaluation_measures)), - m_split(std::move(split)), m_data(std::move(data)) - { - } - - static std::shared_ptr - get_task(const std::string& task_id, const std::string& api_key); - - std::shared_ptr get_dataset() const noexcept - { - return m_data; - } - - std::shared_ptr get_split() const noexcept - { - return m_split; - } - - SGMatrix get_train_indices() const; - - SGMatrix get_test_indices() const; - -#ifndef SWIG - SG_FORCED_INLINE TaskType get_task_type() const noexcept - { - return m_task_type; - } -#endif // SWIG - - private: - static TaskType get_task_from_string(const std::string& task_type); - - std::string m_task_id; - std::string m_task_name; - TaskType m_task_type; - std::string m_task_type_id; - std::unordered_map m_evaluation_measures; - std::shared_ptr m_split; - std::shared_ptr m_data; - }; - - class OpenMLRun; - /** - * The Shogun OpenML extension to run models from an OpenMLFlow - * and convert models to OpenMLFlow. - */ - class ShogunOpenML - { - public: - friend class OpenMLRun; - /** - * Instantiates a SGObject from an OpenMLFlow. 
- * - * @param flow the flow to instantiate - * @param initialize_with_defaults whether to use the default values - * specified in the flow - * @return the flow as a trainable model - */ - static std::shared_ptr flow_to_model( - std::shared_ptr flow, bool initialize_with_defaults); - - /** - * Converts a SGObject to an OpenMLFlow. - * - * @param model the model to convert - * @return the flow from the model conversion - */ - static std::shared_ptr - model_to_flow(const std::shared_ptr& model); - - protected: - static std::shared_ptr run_model_on_fold( - const std::shared_ptr& model, - const std::shared_ptr& task, - const std::shared_ptr& X_train, - index_t repeat_number, index_t fold_number, - const std::shared_ptr& y_train, - const std::shared_ptr& X_test); - - private: - /** - * Helper function to extract module/factory information from the - * class name field of OpenMLFlow. Throws an error either if the - * class name field is ill formed (i.e. not - * library.module.algorithm) or if the library name is not "shogun". 
- * - * @param class_name the flow class_name field - * @return a tuple with the module name (factory string) and the - * algorithm name - */ - static std::pair - get_class_info(const std::string& class_name); - }; - - class OpenMLRun - { - public: - OpenMLRun( - const std::string& uploader, const std::string& uploader_name, - const std::string& setup_id, const std::string& setup_string, - const std::string& parameter_settings, - std::vector evaluations, - std::vector fold_evaluations, - std::vector sample_evaluations, - const std::string& data_content, - std::vector output_files, - std::shared_ptr task, std::shared_ptr flow, - const std::string& run_id, std::shared_ptr model, - std::vector tags, std::string predictions_url) - : m_uploader(uploader), m_uploader_name(uploader_name), - m_setup_id(setup_id), m_setup_string(setup_string), - m_parameter_settings(parameter_settings), - m_evaluations(std::move(evaluations)), - m_fold_evaluations(std::move(fold_evaluations)), - m_sample_evaluations(std::move(sample_evaluations)), - m_data_content(data_content), - m_output_files(std::move(output_files)), m_task(std::move(task)), - m_flow(std::move(flow)), m_run_id(run_id), - m_model(std::move(model)), m_tags(std::move(tags)), - m_predictions_url(std::move(predictions_url)) - { - } - - static std::shared_ptr - from_filesystem(const std::string& directory); - - static std::shared_ptr run_flow_on_task( - std::shared_ptr flow, std::shared_ptr task); - - static std::shared_ptr run_model_on_task( - std::shared_ptr model, std::shared_ptr task); - - void to_filesystem(const std::string& directory) const; - - void publish() const; - - private: - std::string m_uploader; - std::string m_uploader_name; - std::string m_setup_id; - std::string m_setup_string; - std::string m_parameter_settings; - std::vector m_evaluations; - std::vector m_fold_evaluations; - std::vector m_sample_evaluations; - std::string m_data_content; - std::vector m_output_files; - std::shared_ptr m_task; - 
std::shared_ptr m_flow; - std::string m_run_id; - std::shared_ptr m_model; - std::vector m_tags; - std::string m_predictions_url; - }; -} // namespace shogun - -#endif // SHOGUN_OPENMLFLOW_H diff --git a/src/shogun/io/openml/OpenMLData.cpp b/src/shogun/io/openml/OpenMLData.cpp new file mode 100644 index 00000000000..66dc3dcf482 --- /dev/null +++ b/src/shogun/io/openml/OpenMLData.cpp @@ -0,0 +1,255 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include +#include +#include + +#include +#include +#include + +#include + +using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; + +std::shared_ptr +OpenMLData::get_dataset(const std::string& id, const std::string& api_key) +{ + // description + Document document; + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("dataset_description", "json", id); + + document.Parse(return_string.c_str()); + check_response(document, "data_set_description"); + + const Value& dataset_description = document["data_set_description"]; + + auto name = return_if_possible( + "name", dataset_description.GetObject()); + auto description = return_if_possible( + "description", dataset_description.GetObject()); + auto data_format = return_if_possible( + "data_format", dataset_description.GetObject()); + auto dataset_id = + return_if_possible("id", dataset_description.GetObject()); + auto version = return_if_possible( + "version", dataset_description.GetObject()); + auto creator = return_if_possible( + "creator", dataset_description.GetObject()); + auto contributor = return_if_possible( + "contributor", dataset_description.GetObject()); + auto collection_date = return_if_possible( + "collection_date", dataset_description.GetObject()); + auto upload_date = return_if_possible( + "upload_date", dataset_description.GetObject()); + auto language = return_if_possible( + "language", dataset_description.GetObject()); + auto 
licence = return_if_possible( + "licence", dataset_description.GetObject()); + auto url = + return_if_possible("url", dataset_description.GetObject()); + auto default_target_attribute = return_if_possible( + "default_target_attribute", dataset_description.GetObject()); + auto row_id_attribute = return_if_possible( + "row_id_attribute", dataset_description.GetObject()); + auto ignore_attribute = return_if_possible( + "ignore_attribute", dataset_description.GetObject()); + auto version_label = return_if_possible( + "version_label", dataset_description.GetObject()); + auto citation = return_if_possible( + "citation", dataset_description.GetObject()); + auto tags = return_if_possible>( + "tag", dataset_description.GetObject()); + auto visibility = return_if_possible( + "visibility", dataset_description.GetObject()); + auto original_data_url = return_if_possible( + "original_data_url", dataset_description.GetObject()); + auto paper_url = return_if_possible( + "paper_url", dataset_description.GetObject()); + auto update_comment = return_if_possible( + "update_comment", dataset_description.GetObject()); + auto md5_checksum = return_if_possible( + "md5_checksum", dataset_description.GetObject()); + + // features + std::vector>> + param_vector; + return_string = reader.get("data_features", "json", id); + document.Parse(return_string.c_str()); + check_response(document, "data_features"); + const Value& dataset_features = document["data_features"]; + for (const auto& param : dataset_features["feature"].GetArray()) + { + std::unordered_map> param_map; + for (const auto& param_descriptors : param.GetObject()) + { + std::vector second; + if (param_descriptors.value.IsArray()) + for (const auto& v : param_descriptors.value.GetArray()) + second.emplace_back(v.GetString()); + else + second.emplace_back(param_descriptors.value.GetString()); + + param_map.emplace(param_descriptors.name.GetString(), second); + } + param_vector.push_back(param_map); + } + + // qualities + std::vector> 
qualities_vector; + return_string = reader.get("data_qualities", "json", id); + document.Parse(return_string.c_str()); + check_response(document, "data_qualities"); + const Value& data_qualities = document["data_qualities"]; + for (const auto& param : data_qualities["quality"].GetArray()) + { + std::unordered_map param_map; + for (const auto& param_quality : param.GetObject()) + { + if (param_quality.name.IsString() && param_quality.value.IsString()) + param_map.emplace( + param_quality.name.GetString(), + param_quality.value.GetString()); + else if (param_quality.name.IsString()) + param_map.emplace(param_quality.name.GetString(), ""); + } + qualities_vector.push_back(param_map); + } + + auto result = std::make_shared( + name, description, data_format, dataset_id, version, creator, + contributor, collection_date, upload_date, language, licence, url, + default_target_attribute, row_id_attribute, ignore_attribute, + version_label, citation, tags, visibility, original_data_url, paper_url, + update_comment, md5_checksum, param_vector, qualities_vector); + result->set_api_key(api_key); + return result; +} + +std::shared_ptr OpenMLData::get_features() noexcept +{ + if (!m_cached_features) + get_data(); + return m_cached_features; +} + +std::shared_ptr OpenMLData::get_features(const std::string& label) +{ + if (!m_cached_features) + get_data(); + auto find_label = + std::find(m_feature_names.begin(), m_feature_names.end(), label); + if (find_label == m_feature_names.end()) + SG_SERROR("Requested label \"%s\" not in the dataset!\n", label.c_str()) + if (!m_cached_features) + get_data(); + auto col_idx = std::distance(m_feature_names.begin(), find_label); + auto feat_type_copy = m_feature_types; + feat_type_copy.erase(feat_type_copy.begin() + col_idx); + for (const auto type : feat_type_copy) + { + if (type == ARFFDeserializer::Attribute::STRING) + SG_SERROR("Currently cannot process string features!\n") + } + std::shared_ptr result; + bool first = true; + for (int i = 0; 
i < m_feature_types.size(); ++i) + { + if (i != col_idx && first) + { + result.reset(m_cached_features->get_feature_obj(i)); + first = false; + } + if (i != col_idx) + result.reset(result->create_merged_copy( + m_cached_features->get_feature_obj(i))); + } + std::dynamic_pointer_cast>(result)->set_num_features(m_feature_types.size()); + std::dynamic_pointer_cast>(result)->set_num_vectors(m_cached_features->get_num_vectors()); + + return result; +} + +std::shared_ptr OpenMLData::get_labels() +{ + if (!m_cached_features) + get_data(); + REQUIRE( + !m_default_target_attribute.empty(), + "A default target attribute is required if no label is given!\n") + return get_labels(m_default_target_attribute); +} + +std::shared_ptr OpenMLData::get_labels(const std::string& label_name) +{ + if (!m_cached_features) + get_data(); + auto find_label = + std::find(m_feature_names.begin(), m_feature_names.end(), label_name); + if (find_label == m_feature_names.end()) + SG_SERROR( + "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) + auto col_idx = std::distance(m_feature_names.begin(), find_label); + + if (!m_cached_features) + get_data(); + + auto target_label_as_feat = + std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); + + switch (m_feature_types[col_idx]) + { + // real features + case ARFFDeserializer::Attribute::REAL: + case ARFFDeserializer::Attribute::NUMERIC: + case ARFFDeserializer::Attribute::INTEGER: + case ARFFDeserializer::Attribute::DATE: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + auto labels = std::make_shared(labels_vec); + return labels; + } + break; + // nominal features + case ARFFDeserializer::Attribute::NOMINAL: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + for(auto& val: labels_vec) + { + if (val == 0) + val = -1; + } + 
auto labels = std::make_shared(labels_vec); + return labels; + } + break; + default: + SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) + } + + return nullptr; +} + +void OpenMLData::get_data() +{ + auto reader = OpenMLReader(m_api_key); + std::shared_ptr ss = + std::make_shared(reader.get(m_url)); + + auto parser = ARFFDeserializer(ss); + parser.read(); + m_cached_features = parser.get_features(); + m_feature_names = parser.get_feature_names(); + m_feature_types = parser.get_attribute_types(); +} diff --git a/src/shogun/io/openml/OpenMLData.h b/src/shogun/io/openml/OpenMLData.h new file mode 100644 index 00000000000..7e21474775b --- /dev/null +++ b/src/shogun/io/openml/OpenMLData.h @@ -0,0 +1,162 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLDATA_H +#define SHOGUN_OPENMLDATA_H + +#include +#include + +#include +#include +#include + +namespace shogun +{ + /** + * Handles an OpenML dataset. 
+ */ + class OpenMLData + { + public: + OpenMLData( + const std::string& name, const std::string& description, + const std::string& data_format, const std::string& dataset_id, + const std::string& version, const std::string& creator, + const std::string& contributor, const std::string& collection_date, + const std::string& upload_date, const std::string& language, + const std::string& license, const std::string& url, + const std::string& default_target_attribute, + const std::string& row_id_attribute, + const std::string& ignore_attribute, + const std::string& version_label, const std::string& citation, + std::vector tag, const std::string& visibility, + const std::string& original_data_url, const std::string& paper_url, + const std::string& update_comment, const std::string& md5_checksum, + std::vector< + std::unordered_map> + + > + param_descriptors, + std::vector> + param_qualities) + : + + m_name(name), m_description(description), + m_data_format(data_format), m_dataset_id(dataset_id), + m_version(version), m_creator(creator), + m_contributor(contributor), m_collection_date(collection_date), + m_upload_date(upload_date), m_language(language), + m_license(license), m_url(url), + m_default_target_attribute(default_target_attribute), + m_row_id_attribute(row_id_attribute), + m_ignore_attribute(ignore_attribute), + m_version_label(version_label), m_citation(citation), + m_tag(std::move(tag)), m_visibility(visibility), + m_original_data_url(original_data_url), m_paper_url(paper_url), + m_update_comment(update_comment), m_md5_checksum(md5_checksum), + m_param_descriptors(std::move(param_descriptors)), + m_param_qualities(std::move(param_qualities)) + { + } + + /** + * Creates a dataset instance from a given ID. 
+ * + */ + static std::shared_ptr + get_dataset(const std::string& id, const std::string& api_key); + + /** + * Returns ALL the features of the dataset, potentially also the labels + * column + * @return the features + */ + std::shared_ptr get_features() noexcept; + + /** + * Returns the dataset features + * @param label_name the name of the attribute containing the label + * @return the features + */ + std::shared_ptr get_features(const std::string& label_name); + + /** + * Returns the dataset labels if m_default_target_attribute is not empty + * @return the labels + */ + std::shared_ptr get_labels(); + + /** + * Returns the dataset labels given the label_name + * @return the labels + */ + std::shared_ptr get_labels(const std::string& label_name); + + /** + * Returns the type of all attributes/features in the ARFF file + * @return + */ + SG_FORCED_INLINE std::vector + + get_feature_types() const noexcept + { + return m_feature_types; + } + + SG_FORCED_INLINE std::string + + get_default_target_attribute() const noexcept + { + return m_default_target_attribute; + } + + protected: + SG_FORCED_INLINE void set_api_key(const std::string& api_key) noexcept + { + m_api_key = api_key; + } + + private: + void get_data(); + + std::string m_name; + std::string m_description; + std::string m_data_format; + std::string m_dataset_id; + std::string m_version; + std::string m_creator; + std::string m_contributor; + std::string m_collection_date; + std::string m_upload_date; + std::string m_language; + std::string m_license; + std::string m_url; + std::string m_default_target_attribute; + std::string m_row_id_attribute; + std::string m_ignore_attribute; + std::string m_version_label; + std::string m_citation; + std::vector m_tag; + std::string m_visibility; + std::string m_original_data_url; + std::string m_paper_url; + std::string m_update_comment; + std::string m_md5_checksum; + std::vector>> + m_param_descriptors; + std::vector> + m_param_qualities; + std::string m_api_key; + + 
std::shared_ptr m_cached_features; + std::vector m_feature_names; + std::vector m_feature_types; + }; + +} // namespace shogun + +#endif // SHOGUN_OPENMLDATA_H diff --git a/src/shogun/io/openml/OpenMLFlow.cpp b/src/shogun/io/openml/OpenMLFlow.cpp new file mode 100644 index 00000000000..dd2f3f839a6 --- /dev/null +++ b/src/shogun/io/openml/OpenMLFlow.cpp @@ -0,0 +1,115 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include + +#include +#include + +#include + +using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; + + +std::shared_ptr OpenMLFlow::download_flow( + const std::string& flow_id, const std::string& api_key) +{ + Document document; + parameters_type params; + components_type components; + std::string name; + std::string description; + std::string class_name; + + // get flow and parse with RapidJSON + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("flow_file", "json", flow_id); + document.Parse(return_string.c_str()); + check_response(document, "flow"); + + // store root for convenience. We know it exists from previous check. 
+ const Value& root = document["flow"]; + + // handle parameters + if (root.HasMember("parameter")) + { + std::unordered_map param_dict; + + if (root["parameter"].IsArray()) + { + for (const auto& v : root["parameter"].GetArray()) + { + emplace_string_to_map(v, param_dict, "data_type"); + emplace_string_to_map(v, param_dict, "default_value"); + emplace_string_to_map(v, param_dict, "description"); + params.emplace(v["name"].GetString(), param_dict); + param_dict.clear(); + } + } + else + { + // parameter can also be a dict, instead of array + const auto v = root["parameter"].GetObject(); + emplace_string_to_map(v, param_dict, "data_type"); + emplace_string_to_map(v, param_dict, "default_value"); + emplace_string_to_map(v, param_dict, "description"); + params.emplace(v["name"].GetString(), param_dict); + } + } + + // handle components, i.e. kernels + if (root.HasMember("component")) + { + if (root["component"].IsArray()) + { + for (const auto& v : root["component"].GetArray()) + { + components.emplace( + v["identifier"].GetString(), + OpenMLFlow::download_flow( + v["flow"]["id"].GetString(), api_key)); + } + } + else + { + components.emplace( + root["component"]["identifier"].GetString(), + OpenMLFlow::download_flow( + root["component"]["flow"]["id"].GetString(), api_key)); + } + } + + // get remaining information from flow + if (root.HasMember("name")) + name = root["name"].GetString(); + if (root.HasMember("description")) + description = root["description"].GetString(); + if (root.HasMember("class_name")) + class_name = root["class_name"].GetString(); + + auto flow = std::make_shared( + name, description, class_name, components, params); + + return flow; +} + +void OpenMLFlow::upload_flow(const std::shared_ptr& flow) +{ + SG_SNOTIMPLEMENTED; +} + +void OpenMLFlow::dump() const +{ + SG_SNOTIMPLEMENTED; +} + +std::shared_ptr OpenMLFlow::from_file() +{ + SG_SNOTIMPLEMENTED; + return std::shared_ptr(); +} \ No newline at end of file diff --git 
a/src/shogun/io/openml/OpenMLFlow.h b/src/shogun/io/openml/OpenMLFlow.h new file mode 100644 index 00000000000..e8424e58343 --- /dev/null +++ b/src/shogun/io/openml/OpenMLFlow.h @@ -0,0 +1,142 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLFLOW_H +#define SHOGUN_OPENMLFLOW_H + +#include + +#include +#include + + +namespace shogun +{ + /** + * Handles OpenML flows. A flow contains the information + * required to instantiate a model. + */ + class OpenMLFlow + { + + public: + /** alias for component type, map of flows */ + using components_type = + std::unordered_map>; + /** alias for parameter type, map of maps with information specific to a + * parameter */ + using parameters_type = std::unordered_map< + std::string, std::unordered_map>; + + /** + * The OpenMLFlow constructor. This constructor is rarely used by the + * user and is used by the static class members download_flow and + * from_file. The user is expected to use either of the previously + * mentioned functions. + * + * @param name the model name + * @param description the model description + * @param model the flow class_name field + * @param components a map of subflows, i.e. kernels + * @param parameters a map of parameter information, i.e. default values + * for each parameter name + */ + OpenMLFlow( + const std::string& name, const std::string& description, + const std::string& model, components_type components, + parameters_type parameters) + : m_name(name), m_description(description), m_class_name(model), + m_parameters(std::move(parameters)), + m_components(std::move(components)) + { + } + + /** + * Instantiates a OpenMLFlow by downloaded a flow from the OpenML + * server. 
+ * + * @param flow_id the flow ID + * @param api_key the user API key (might not be required and can be an + * empty string) + * @return the OpenMLFlow corresponding to the flow requested + * @throws ShogunException when there is a server error or the requested + * flow is ill formed. + */ + static std::shared_ptr + download_flow(const std::string& flow_id, const std::string& api_key); + + /** + * Instantiates a OpenMLFlow from a file. + * @return the OpenMLFlow corresponding to the flow requested + */ + static std::shared_ptr from_file(); + + /** + * Publishes a flow to the OpenML server + * @param flow the flow to be published + */ + static void upload_flow(const std::shared_ptr& flow); + + /** + * Dumps the OpenMLFlow to disk. + */ + void dump() const; + + /** + * Gets a subflow, i.e. a kernel in a machine + * @param name the name of the subflow, not the flow ID + * @return the subflow if it exists + */ + std::shared_ptr get_subflow(const std::string& name) + { + auto find_flow = m_components.find(name); + if (find_flow != m_components.end()) + return find_flow->second; + else + SG_SERROR( + "The provided subflow could not be found in this flow!") + return nullptr; + } + +#ifndef SWIG + SG_FORCED_INLINE parameters_type + + get_parameters() const noexcept + { + return m_parameters; + } + + SG_FORCED_INLINE components_type + + get_components() const noexcept + { + return m_components; + } + + SG_FORCED_INLINE std::string + + get_class_name() const noexcept + { + return m_class_name; + } + +#endif // SWIG + + private: + /** name field of the flow */ + std::string m_name; + /** description field of the flow */ + std::string m_description; + /** the class_name field of the flow */ + std::string m_class_name; + /** the parameter field of the flow (optional) */ + parameters_type m_parameters; + /** the components fields of the flow (optional) */ + components_type m_components; + }; +} // namespace shogun + +#endif // SHOGUN_OPENMLFLOW_H diff --git 
a/src/shogun/io/openml/OpenMLReader.cpp b/src/shogun/io/openml/OpenMLReader.cpp new file mode 100644 index 00000000000..bd9d1b7de34 --- /dev/null +++ b/src/shogun/io/openml/OpenMLReader.cpp @@ -0,0 +1,97 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include + +#ifdef HAVE_CURL +#include +#endif // HAVE_CURL + +using namespace shogun; + +/** + * The writer callback function used to write the packets to a C++ string. + * @param data the data received in CURL request + * @param size always 1 + * @param nmemb the size of data + * @param buffer_in the buffer to write to + * @return the size of buffer that was written + */ +size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) +{ + // check that the buffer string points to something + if (buffer_in != nullptr) + { + // Append the data to the buffer + buffer_in->append(data, size * nmemb); + + return size * nmemb; + } + return 0; +} + +/* OpenML server format */ +const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; +const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLReader::download_server = ""; +const char* OpenMLReader::splits_server = "https://www.openml.org/api_splits"; + +/* DATA API */ +const char* OpenMLReader::dataset_description = "/data/{}"; +const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; +const char* OpenMLReader::data_features = "/data/features/{}"; +const char* OpenMLReader::data_qualities = "/data/qualities/{}"; +const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; +const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; +/* FLOW API */ +const char* OpenMLReader::flow_file = "/flow/{}"; +/* TASK API */ +const char* OpenMLReader::task_file = "/task/{}"; +/* SPLIT API */ +const char* OpenMLReader::get_split = "/get/{}"; + +const std::unordered_map + OpenMLReader::m_format_options = {{"xml", 
xml_server}, + {"json", json_server}, + {"split", splits_server}, + {"download", download_server}}; +const std::unordered_map + OpenMLReader::m_request_options = { + {"dataset_description", dataset_description}, + {"list_data_qualities", list_data_qualities}, + {"data_features", data_features}, + {"data_qualities", data_qualities}, + {"list_dataset_qualities", list_dataset_qualities}, + {"list_dataset_filter", list_dataset_filter}, + {"flow_file", flow_file}, + {"task_file", task_file}}; + +void OpenMLReader::openml_curl_request_helper(const std::string& url) +{ +#ifdef HAVE_CURL + CURL* curl_handle = nullptr; + + curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.\n") + return; + } + + curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl_handle, CURLOPT_HTTPGET, 1); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); + + CURLcode res = curl_easy_perform(curl_handle); + + if (res != CURLE_OK) + SG_SERROR("Connection error: %s.\n", curl_easy_strerror(res)) + + curl_easy_cleanup(curl_handle); +#endif // HAVE_CURL +} diff --git a/src/shogun/io/openml/OpenMLReader.h b/src/shogun/io/openml/OpenMLReader.h new file mode 100644 index 00000000000..8078b1e78ec --- /dev/null +++ b/src/shogun/io/openml/OpenMLReader.h @@ -0,0 +1,183 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLREADER_H +#define SHOGUN_OPENMLREADER_H + +#include +#include + +#include +#include +#include +#include + +namespace shogun +{ + /** + * Reads OpenML streams which can be downloaded with this function. + */ + class OpenMLReader + { + + public: + explicit OpenMLReader(const std::string& api_key) : m_api_key(api_key) + { + } + + /** + * Returns a string returned by the server given a request. + * Raises an error if the returned code is not 200. 
+ * Additional arguments can be passed to the request, + * which are then concatenated with a "/" character. + * + * @tparam Args argument type pack, should all be std::string + * @param request the request name, see m_request_options + * @param format the format to return the data in, see m_format_options + * @param args the additional arguments to be passed to request + * @return the returned stream from the server if the return code is 200 + */ + template + std::string + get(const std::string& request, const std::string& format, Args... args) + { + std::string request_path; + auto find_format = m_format_options.find(format); + if (find_format == m_format_options.end()) + { + SG_SERROR( + "The provided format \"%s\" is not available\n", + format.c_str()) + } + + if (format == "split") + { + REQUIRE( + request == "get_split", + "Split server can only handle \"get_split\" request.\n") + request_path = get_split; + } + else + { + auto find_request = m_request_options.find(request); + if (find_request == m_request_options.end()) + { + SG_SERROR( + "Could not find a way to solve the request \"%s\"\n", + request.c_str()) + } + request_path = find_request->second; + } + + std::string request_format = find_format->second; + + // get additional args and concatenate them with "/" + if (sizeof...(Args) > 0) + { + if (request_path.substr(request_path.size() - 2) == "{}") + { + request_path = + request_path.substr(0, request_path.size() - 2); + } + else + { + SG_SERROR( + "The provided request \"%s\" cannot handle additional " + "args.\n", + request.c_str()) + } + std::vector args_vec = {args...}; + std::string args_string = std::accumulate( + args_vec.begin() + 1, args_vec.end(), args_vec.front(), + [](std::string s0, std::string& s1) { + return s0 += "/" + s1; + }); + request_path += args_string; + } + + std::string url = request_format + request_path + "?" 
+ m_api_key; + + return get(url); + } + + std::string get(const std::string& url) + { +#ifdef HAVE_CURL + // clear the buffer before request + m_curl_response_buffer.clear(); + + openml_curl_request_helper(url); + return m_curl_response_buffer; +#else + SG_SERROR( + "This function is only available with the CURL library!\n") +#endif // HAVE_CURL + } + + private: + /** the raw buffer as a C++ string */ + std::string m_curl_response_buffer; + + /** + * Initialises CURL session and gets the data. + * This function also handles the response code from the server. + * + * @param url the url to query + */ + void openml_curl_request_helper(const std::string& url); + + /** the user API key, not required for all requests */ + std::string m_api_key; + + /** the server path to get a response in XML format*/ + static const char* xml_server; + /** the server path to get a response in JSON format*/ + static const char* json_server; + /** the server path to download datasets */ + static const char* download_server; + /** the server path to get a split in ARFF format */ + static const char* splits_server; + + /** the server response format options: XML or JSON */ + static const std::unordered_map + m_format_options; + /** all the supported server options */ + static const std::unordered_map + m_request_options; + + /* DATA API */ + static const char* dataset_description; + static const char* list_data_qualities; + static const char* data_features; + static const char* data_qualities; + static const char* list_dataset_qualities; + static const char* list_dataset_filter; + + /* FLOW API */ + static const char* flow_file; + + /* TASK API */ + static const char* task_file; + + /* SPLIT API */ + static const char* get_split; + }; + + /** + * Writes OpenML streams to the OpenML server. 
+ */ + class OpenMLWritter + { + public: + OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; + + private: + /** the user API key, likely to be needed to write to OpenML */ + std::string m_api_key; + }; +} // namespace shogun + +#endif // SHOGUN_OPENMLREADER_H diff --git a/src/shogun/io/openml/OpenMLRun.cpp b/src/shogun/io/openml/OpenMLRun.cpp new file mode 100644 index 00000000000..796545e10de --- /dev/null +++ b/src/shogun/io/openml/OpenMLRun.cpp @@ -0,0 +1,68 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include +#include +#include +#include + +using namespace shogun; + +std::shared_ptr OpenMLRun::run_model_on_task( + std::shared_ptr model, std::shared_ptr task) +{ + SG_SNOTIMPLEMENTED + return std::shared_ptr(); +} + +std::shared_ptr OpenMLRun::run_flow_on_task( + std::shared_ptr flow, std::shared_ptr task) +{ + auto data = task->get_dataset(); + std::shared_ptr train_features = nullptr, test_features = nullptr; + std::shared_ptr train_labels = nullptr, test_labels = nullptr; + + if (task->get_split()->contains_splits()) + SG_SNOTIMPLEMENTED + else + { + train_labels = data->get_labels(); + train_features = + data->get_features(data->get_default_target_attribute()); + // ensures delete is called by shared ptr destructor + SG_REF(train_labels.get()) + SG_REF(train_features.get()) + auto model = ShogunOpenML::flow_to_model(std::move(flow), true); + + if (auto machine = std::dynamic_pointer_cast(model)) + { + auto result = ShogunOpenML::run_model_on_fold( + machine, task, train_features, 0, 0, train_labels, + test_features); + SG_SDEBUG(result->to_string().c_str()); + } + else + SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") + } + return std::shared_ptr(); +} + +std::shared_ptr +OpenMLRun::from_filesystem(const std::string& directory) +{ + SG_SNOTIMPLEMENTED + return nullptr; +} + +void OpenMLRun::to_filesystem(const std::string& directory) const +{ + 
SG_SNOTIMPLEMENTED +} + +void OpenMLRun::publish() const +{ + SG_SNOTIMPLEMENTED +} diff --git a/src/shogun/io/openml/OpenMLRun.h b/src/shogun/io/openml/OpenMLRun.h new file mode 100644 index 00000000000..9046d09bea9 --- /dev/null +++ b/src/shogun/io/openml/OpenMLRun.h @@ -0,0 +1,78 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLRUN_H +#define SHOGUN_OPENMLRUN_H + +#include + +#include +#include + +namespace shogun { + class OpenMLRun + { + public: + OpenMLRun( + const std::string& uploader, const std::string& uploader_name, + const std::string& setup_id, const std::string& setup_string, + const std::string& parameter_settings, + std::vector evaluations, + std::vector fold_evaluations, + std::vector sample_evaluations, + const std::string& data_content, + std::vector output_files, + std::shared_ptr task, std::shared_ptr flow, + const std::string& run_id, std::shared_ptr model, + std::vector tags, std::string predictions_url) + : m_uploader(uploader), m_uploader_name(uploader_name), + m_setup_id(setup_id), m_setup_string(setup_string), + m_parameter_settings(parameter_settings), + m_evaluations(std::move(evaluations)), + m_fold_evaluations(std::move(fold_evaluations)), + m_sample_evaluations(std::move(sample_evaluations)), + m_data_content(data_content), + m_output_files(std::move(output_files)), m_task(std::move(task)), + m_flow(std::move(flow)), m_run_id(run_id), + m_model(std::move(model)), m_tags(std::move(tags)), + m_predictions_url(std::move(predictions_url)) + { + } + + static std::shared_ptr + from_filesystem(const std::string& directory); + + static std::shared_ptr run_flow_on_task( + std::shared_ptr flow, std::shared_ptr task); + + static std::shared_ptr run_model_on_task( + std::shared_ptr model, std::shared_ptr task); + + void to_filesystem(const std::string& directory) const; + + void publish() const; + + private: + std::string m_uploader; + std::string 
m_uploader_name; + std::string m_setup_id; + std::string m_setup_string; + std::string m_parameter_settings; + std::vector m_evaluations; + std::vector m_fold_evaluations; + std::vector m_sample_evaluations; + std::string m_data_content; + std::vector m_output_files; + std::shared_ptr m_task; + std::shared_ptr m_flow; + std::string m_run_id; + std::shared_ptr m_model; + std::vector m_tags; + std::string m_predictions_url; + }; +} + +#endif //SHOGUN_OPENMLRUN_H diff --git a/src/shogun/io/openml/OpenMLSplit.cpp b/src/shogun/io/openml/OpenMLSplit.cpp new file mode 100644 index 00000000000..c4481a860cd --- /dev/null +++ b/src/shogun/io/openml/OpenMLSplit.cpp @@ -0,0 +1,112 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include +#include +#include + +#include +#include + +using namespace shogun; + +std::shared_ptr +OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) +{ + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("get_split", "split", split_url); + + if (return_string == "Task not providing datasplits.") + return std::make_shared(); + + auto return_stream = std::istringstream(return_string); + // TODO: add ARFF parsing here + // get train/test indices + // TODO: replace line below with ARFFDeserialiser::get_features() + auto arff_features = std::make_shared(); + REQUIRE( + arff_features->get_num_feature_obj() == 4, + "Expected a ARFF file with 4 attributes: type, rowid, repeat and " + "fold.\n") + + auto train_test_feat = + std::shared_ptr(arff_features->get_feature_obj(0)); + auto rowid_feat = + std::shared_ptr(arff_features->get_feature_obj(1)); + auto repeat_feat = + std::shared_ptr(arff_features->get_feature_obj(2)); + auto fold_feat = + std::shared_ptr(arff_features->get_feature_obj(3)); + + auto type_vector = string_feature_to_vector(train_test_feat); + auto rowid_vector = dense_feature_to_vector(rowid_feat); + auto repeat_vector = 
dense_feature_to_vector(repeat_feat); + auto fold_vector = dense_feature_to_vector(fold_feat); + + std::vector> train_idx, test_idx; + for (int i = 0; i < arff_features->get_num_vectors(); ++i) + { + if (type_vector[i] == LabelType::TRAIN) + train_idx.emplace_back(std::initializer_list{ + static_cast(rowid_vector[i]), + static_cast(repeat_vector[i]), + static_cast(fold_vector[i])}); + else + test_idx.emplace_back(std::initializer_list{ + static_cast(rowid_vector[i]), + static_cast(repeat_vector[i]), + static_cast(fold_vector[i])}); + } + + return std::make_shared(train_idx, test_idx); +} + +SGVector +OpenMLSplit::dense_feature_to_vector(const std::shared_ptr& feat) +{ + auto casted_feat = + std::dynamic_pointer_cast>(feat); + // this should never happen + if (!casted_feat) + SG_SERROR("Error casting a column in the split file from CFeatures to " + "CDenseFeatures!\n>"); + return casted_feat->get_feature_vector(0); +} + +std::vector +OpenMLSplit::string_feature_to_vector(const std::shared_ptr& feat) +{ + auto casted_feat = std::dynamic_pointer_cast>(feat); + // this should never happen + if (!casted_feat) + SG_SERROR("Error casting a column in the split file from CFeatures to " + "CStringFeatures!\n"); + + auto to_lower = [](const std::string& line) { + std::string result; + std::transform( + line.begin(), line.end(), std::back_inserter(result), + [](uint8_t val) { return std::tolower(val); }); + return result; + }; + + std::vector result; + + for (int i = 0; i < casted_feat->get_num_vectors(); ++i) + { + auto row = casted_feat->get_feature_vector(i); + std::string label(1, row[0]); + for (auto j = 1; j < casted_feat->get_max_vector_length(); ++j) + label.append(1, row[j]); + if (to_lower(label) == "train") + result.push_back(LabelType::TRAIN); + else if (to_lower(label) == "test") + result.push_back(LabelType::TEST); + else + SG_SERROR("Unknown label type in split file %s!\n", label.c_str()) + } + return result; +} \ No newline at end of file diff --git 
a/src/shogun/io/openml/OpenMLSplit.h b/src/shogun/io/openml/OpenMLSplit.h new file mode 100644 index 00000000000..ab61aebdd77 --- /dev/null +++ b/src/shogun/io/openml/OpenMLSplit.h @@ -0,0 +1,71 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLSPLIT_H +#define SHOGUN_OPENMLSPLIT_H + +#include +#include + +namespace shogun +{ + /** + * Handles an OpenML split. + */ + class OpenMLSplit + { + public: + enum class LabelType + { + TRAIN = 1, + TEST = 2 + }; + + /** + * Default constructor. This is used when there are no + * train or test indices. + */ + OpenMLSplit() = default; + + OpenMLSplit( + std::vector> train_idx, + std::vector> test_idx) + : m_train_idx(std::move(train_idx)), m_test_idx(std::move(test_idx)) + { + } + + static std::shared_ptr + get_split(const std::string& split_url, const std::string& api_key); + + SG_FORCED_INLINE std::vector> get_train_idx() const + noexcept + { + return m_train_idx; + } + + SG_FORCED_INLINE std::vector> get_test_idx() const + noexcept + { + return m_test_idx; + } + + SG_FORCED_INLINE bool contains_splits() const noexcept + { + return !m_train_idx.empty() && !m_test_idx.empty(); + } + + private: + static SGVector + dense_feature_to_vector(const std::shared_ptr& feat); + + static std::vector + string_feature_to_vector(const std::shared_ptr& feat); + + std::vector> m_train_idx; + std::vector> m_test_idx; + }; +} // namespace shogun +#endif // SHOGUN_OPENMLSPLIT_H diff --git a/src/shogun/io/openml/OpenMLTask.cpp b/src/shogun/io/openml/OpenMLTask.cpp new file mode 100644 index 00000000000..493664a8367 --- /dev/null +++ b/src/shogun/io/openml/OpenMLTask.cpp @@ -0,0 +1,137 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). 
+ * + * Authors: Gil Hoben + */ + +#include +#include +#include + +using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; + +std::shared_ptr +OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) +{ + Document document; + std::string task_name; + std::string task_type_id; + std::shared_ptr openml_dataset = nullptr; + std::shared_ptr openml_split = nullptr; + std::unordered_map evaluation_measures; + + auto reader = OpenMLReader(api_key); + auto return_string = reader.get("task_file", "json", task_id); + + document.Parse(return_string.c_str()); + check_response(document, "task"); + + const Value& root = document["task"]; + + REQUIRE( + task_id == root["task_id"].GetString(), + "Expected downloaded task to have the same id as the requested task " + "id, but got \"%s\", instead of \"%s\".\n", + root["task_id"].GetString(), task_id.c_str()) + + task_name = root["task_name"].GetString(); + OpenMLTask::TaskType task_type = + get_task_from_string(root["task_type"].GetString()); + task_type_id = root["task_type_id"].GetString(); + + // expect two elements in input array: dataset and split + const Value& json_input = root["input"]; + + auto input_array = json_input.GetArray(); + + for (const auto& task_settings : input_array) + { + if (strcmp(task_settings["name"].GetString(), "source_data") == 0) + { + auto dataset_info = task_settings["data_set"].GetObject(); + std::string dataset_id = dataset_info["data_set_id"].GetString(); + std::string target_feature = + dataset_info["target_feature"].GetString(); + openml_dataset = OpenMLData::get_dataset(dataset_id, api_key); + } + else if ( + strcmp(task_settings["name"].GetString(), "estimation_procedure") == + 0) + { + auto split_info = task_settings["estimation_procedure"].GetObject(); + std::string split_id = split_info["id"].GetString(); + std::string split_type = split_info["type"].GetString(); + std::string split_url = split_info["data_splits_url"].GetString(); + 
std::unordered_map split_parameters; + for (const auto& param : split_info["parameter"].GetArray()) + { + if (param.HasMember("name") && param.HasMember("value")) + split_parameters.emplace( + param["name"].GetString(), param["value"].GetString()); + else if (param.HasMember("name")) + split_parameters.emplace(param["name"].GetString(), ""); + else + SG_SERROR( + "Unexpected number of parameters in parameter array " + "of estimation_procedure.\n") + } + REQUIRE( + split_type == "crossvalidation", + "Currently only tasks with cross validation are enabled in " + "shogun!\n") + openml_split = OpenMLSplit::get_split(split_url, api_key); + } + else if ( + strcmp(task_settings["name"].GetString(), "evaluation_measures") == + 0) + { + auto evaluation_info = + task_settings["evaluation_measures"].GetObject(); + for (const auto& param : evaluation_info) + { + if (param.value.IsString()) + evaluation_measures.emplace( + param.name.GetString(), param.value.GetString()); + else + evaluation_measures.emplace(param.name.GetString(), ""); + } + } + } + + if (openml_dataset == nullptr && openml_split == nullptr) + SG_SERROR("Error parsing task.\n") + + auto result = std::make_shared( + task_id, task_name, task_type, task_type_id, evaluation_measures, + openml_split, openml_dataset); + + return result; +} + +OpenMLTask::TaskType +OpenMLTask::get_task_from_string(const std::string& task_type) +{ + if (task_type == "Supervised Classification") + return OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION; + SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) +} + +std::vector> OpenMLTask::get_train_indices() const +{ + return get_indices(m_split->get_train_idx()); +} + +std::vector> OpenMLTask::get_test_indices() const +{ + return get_indices(m_split->get_test_idx()); +} + +std::vector> +OpenMLTask::get_indices(const std::vector>& idx) const +{ + SG_SNOTIMPLEMENTED + std::vector> result; + return result; +} \ No newline at end of file diff --git 
a/src/shogun/io/openml/OpenMLTask.h b/src/shogun/io/openml/OpenMLTask.h new file mode 100644 index 00000000000..a849ea37de9 --- /dev/null +++ b/src/shogun/io/openml/OpenMLTask.h @@ -0,0 +1,95 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_OPENMLTASK_H +#define SHOGUN_OPENMLTASK_H + +#include +#include + +namespace shogun +{ + /** + * Handles OpenML tasks. A task contains all the information + * required to train and test a model. + */ + class OpenMLTask + { + public: + enum class TaskType + { + SUPERVISED_CLASSIFICATION = 0, + SUPERVISED_REGRESSION = 1, + LEARNING_CURVE = 2, + SUPERVISED_DATASTREAM_CLASSIFICATION = 3, + CLUSTERING = 4, + MACHINE_LEARNING_CHALLENGE = 5, + SURVIVAL_ANALYSIS = 6, + SUBGROUP_DISCOVERY = 7 + }; + + enum class TaskEvaluation + { + + }; + + OpenMLTask( + const std::string& task_id, const std::string task_name, + TaskType task_type, const std::string& task_type_id, + std::unordered_map evaluation_measures, + std::shared_ptr split, + std::shared_ptr data) + : m_task_id(task_id), m_task_name(task_name), + m_task_type(task_type), m_task_type_id(task_type_id), + m_evaluation_measures(std::move(evaluation_measures)), + m_split(std::move(split)), m_data(std::move(data)) + { + } + + static std::shared_ptr + get_task(const std::string& task_id, const std::string& api_key); + + std::shared_ptr get_dataset() const noexcept + { + return m_data; + } + + std::shared_ptr get_split() const noexcept + { + return m_split; + } + + std::vector> get_train_indices() const; + + std::vector> get_test_indices() const; + +#ifndef SWIG + SG_FORCED_INLINE TaskType + + get_task_type() const noexcept + { + return m_task_type; + } + +#endif // SWIG + + private: + static TaskType get_task_from_string(const std::string& task_type); + + std::vector> + get_indices(const std::vector>& idx) const; + + std::string m_task_id; + std::string m_task_name; + TaskType m_task_type; + 
std::string m_task_type_id; + std::unordered_map m_evaluation_measures; + std::shared_ptr m_split; + std::shared_ptr m_data; + }; +} // namespace shogun + +#endif // SHOGUN_OPENMLTASK_H diff --git a/src/shogun/io/openml/ShogunOpenML.cpp b/src/shogun/io/openml/ShogunOpenML.cpp new file mode 100644 index 00000000000..80dbc03810f --- /dev/null +++ b/src/shogun/io/openml/ShogunOpenML.cpp @@ -0,0 +1,340 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include + +#include + +using namespace shogun; + +/** + * Class using the Any visitor pattern to convert + * a string to a C++ type that can be used as a parameter + * in a Shogun model. If the string value is not "null" it will + * be put in its casted type in the given model with the provided parameter + * name. If the value is null nothing happens, i.e. no error is thrown + * and no value is put in model. + */ +class StringToShogun : public AnyVisitor +{ +public: + explicit StringToShogun(std::shared_ptr model) + : m_model(model), m_parameter(""), m_string_val(""){}; + + StringToShogun( + std::shared_ptr model, const std::string& parameter, + const std::string& string_val) + : m_model(model), m_parameter(parameter), m_string_val(string_val){}; + + void on(bool* v) final + { + SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + bool result = strcmp(m_string_val.c_str(), "true") == 0; + m_model->put(m_parameter, result); + } + } + void on(int32_t* v) final + { + SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + try + { + int32_t result = std::stoi(m_string_val); + m_model->put(m_parameter, result); + } + catch (const std::invalid_argument&) + { + // it's an option, i.e. 
internally represented + // as an enum but in swig exposed as a string + m_string_val.erase( + std::remove_if( + m_string_val.begin(), m_string_val.end(), + // remove quotes + [](const auto& val) { return val == '\"'; }), + m_string_val.end()); + m_model->put(m_parameter, m_string_val); + } + } + } + void on(int64_t* v) final + { + SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + + int64_t result = std::stol(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(float* v) final + { + SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + float32_t result = std::stof(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(double* v) final + { + SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + float64_t result = std::stod(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(long double* v) + { + SG_SDEBUG( + "long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + if (!is_null()) + { + floatmax_t result = std::stold(m_string_val); + m_model->put(m_parameter, result); + } + } + void on(CSGObject** v) final + { + SG_SDEBUG( + "CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) + } + void on(SGVector* v) final + { + SG_SDEBUG( + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + } + void on(SGMatrix* mat) final + { + SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) + } + void on(SGMatrix* mat) final{SG_SDEBUG( + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())} + 
+ /** + * In OpenML "null" is an empty parameter value field. + * @return whether the field is "null" + */ + SG_FORCED_INLINE bool is_null() const noexcept + { + bool result = strcmp(m_string_val.c_str(), "null") == 0; + return result; + } + + SG_FORCED_INLINE void set_parameter_name(const std::string& name) noexcept + { + m_parameter = name; + } + + SG_FORCED_INLINE void set_string_value(const std::string& value) noexcept + { + m_string_val = value; + } + +private: + std::shared_ptr m_model; + std::string m_parameter; + std::string m_string_val; +}; + +/** + * Instantiates a CSGObject using a factory + * @param factory_name the name of the factory + * @param algo_name the name of algorithm passed to factory + * @return the instantiated object using a factory + */ +std::shared_ptr instantiate_model_from_factory( + const std::string& factory_name, const std::string& algo_name) +{ + if (factory_name == "machine") + return std::shared_ptr(machine(algo_name)); + if (factory_name == "kernel") + return std::shared_ptr(kernel(algo_name)); + if (factory_name == "distance") + return std::shared_ptr(distance(algo_name)); + + SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str()) + + return nullptr; +} + +/** + * Downcasts a CSGObject and puts it in the map of obj. + * @param obj the main object + * @param nested_obj the object to be casted and put in the obj map. 
+ * @param parameter_name the name of nested_obj + */ +void cast_and_put( + const std::shared_ptr& obj, + const std::shared_ptr& nested_obj, + const std::string& parameter_name) +{ + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + // TODO: remove clone + // temporary fix until shared_ptr PR merged + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + return; + } + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + return; + } + if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) + { + auto* tmp_clone = dynamic_cast(casted_obj->clone()); + obj->put(parameter_name, tmp_clone); + return; + } + SG_SERROR("Could not cast SGObject.\n") +} + +std::shared_ptr ShogunOpenML::flow_to_model( + std::shared_ptr flow, bool initialize_with_defaults) +{ + auto params = flow->get_parameters(); + auto components = flow->get_components(); + auto class_name = get_class_info(flow->get_class_name()); + auto module_name = class_name.first; + auto algo_name = class_name.second; + + auto obj = instantiate_model_from_factory(module_name, algo_name); + auto obj_param = obj->get_params(); + + auto visitor = std::make_unique(obj); + + if (initialize_with_defaults) + { + for (const auto& param : params) + { + Any any_val = obj_param.at(param.first)->get_value(); + std::string name = param.first; + std::string val_as_string = param.second.at("default_value"); + visitor->set_parameter_name(name); + visitor->set_string_value(val_as_string); + any_val.visit(visitor.get()); + } + } + + for (const auto& component : components) + { + std::shared_ptr nested_obj = + flow_to_model(component.second, initialize_with_defaults); + cast_and_put(obj, nested_obj, component.first); + } + + SG_SDEBUG("Final object: %s.\n", obj->to_string().c_str()); + + return obj; +} + +std::shared_ptr +ShogunOpenML::model_to_flow(const std::shared_ptr& 
model)
+{
+	return std::shared_ptr();
+}
+
+/**
+ * Splits a flow class name of the form library.module.algorithm on '.'
+ * and returns (module, algorithm). An error is reported if the name is
+ * empty, if the library is not "shogun", or if a "shogun" name does not
+ * have exactly three components.
+ *
+ * @param class_name the flow class_name field
+ * @return the module (factory) name and the algorithm name
+ */
+std::pair
+ShogunOpenML::get_class_info(const std::string& class_name)
+{
+	std::vector class_components;
+	auto begin = class_name.begin();
+	std::pair result;
+
+	for (auto it = class_name.begin(); it != class_name.end(); ++it)
+	{
+		if (*it == '.')
+		{
+			class_components.emplace_back(std::string(begin, it));
+			begin = std::next(it);
+		}
+		// the last character closes the final component
+		if (std::next(it) == class_name.end())
+			class_components.emplace_back(std::string(begin, std::next(it)));
+	}
+
+	// An empty class_name produces no component at all, so the vector must
+	// be checked before its first element is read.
+	if (class_components.empty())
+		SG_SERROR("Invalid class name format %s.\n", class_name.c_str())
+	else if (class_components[0] != "shogun")
+		SG_SERROR(
+		    "The provided flow is not meant for shogun deserialisation! The "
+		    "required library is \"%s\".\n",
+		    class_components[0].c_str())
+	else if (class_components.size() != 3)
+		SG_SERROR("Invalid class name format %s.\n", class_name.c_str())
+	else
+		result = std::make_pair(class_components[1], class_components[2]);
+
+	return result;
+}
+
+std::shared_ptr ShogunOpenML::run_model_on_fold(
+    const std::shared_ptr& model,
+    const std::shared_ptr& task,
+    const std::shared_ptr& X_train, index_t repeat_number,
+    index_t fold_number, const std::shared_ptr& y_train,
+    const std::shared_ptr& X_test)
+{
+	auto task_type = task->get_task_type();
+	auto model_clone = std::shared_ptr(model->clone());
+
+	switch (task_type)
+	{
+	case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION:
+	case OpenMLTask::TaskType::SUPERVISED_REGRESSION:
+	{
+		if (auto machine = std::dynamic_pointer_cast(model_clone))
+		{
+			// TODO: refactor. 
more useless clones until smart pointers are merged + machine->put("labels", y_train->clone()->as()); + auto tmp = X_train.get(); + machine->train(tmp); + if (X_test) + return std::shared_ptr(machine->apply(X_test.get())); + else + return std::shared_ptr(machine->apply(X_train.get())); + } + else + SG_SERROR("The provided model is not a trainable machine!\n") + } + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + return nullptr; +} \ No newline at end of file diff --git a/src/shogun/io/openml/ShogunOpenML.h b/src/shogun/io/openml/ShogunOpenML.h new file mode 100644 index 00000000000..ff5257f19b8 --- /dev/null +++ b/src/shogun/io/openml/ShogunOpenML.h @@ -0,0 +1,70 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#ifndef SHOGUN_SHOGUNOPENML_H +#define SHOGUN_SHOGUNOPENML_H + +#include + +#include +#include + +namespace shogun +{ + class OpenMLRun; + /** + * The Shogun OpenML extension to run models from an OpenMLFlow + * and convert models to OpenMLFlow. + */ + class ShogunOpenML + { + public: + friend class OpenMLRun; + /** + * Instantiates a SGObject from an OpenMLFlow. + * + * @param flow the flow to instantiate + * @param initialize_with_defaults whether to use the default values + * specified in the flow + * @return the flow as a trainable model + */ + static std::shared_ptr flow_to_model( + std::shared_ptr flow, bool initialize_with_defaults); + + /** + * Converts a SGObject to an OpenMLFlow. 
+ * + * @param model the model to convert + * @return the flow from the model conversion + */ + static std::shared_ptr + model_to_flow(const std::shared_ptr& model); + + protected: + static std::shared_ptr run_model_on_fold( + const std::shared_ptr& model, + const std::shared_ptr& task, + const std::shared_ptr& X_train, index_t repeat_number, + index_t fold_number, const std::shared_ptr& y_train, + const std::shared_ptr& X_test); + + private: + /** + * Helper function to extract module/factory information from the + * class name field of OpenMLFlow. Throws an error either if the + * class name field is ill formed (i.e. not + * library.module.algorithm) or if the library name is not "shogun". + * + * @param class_name the flow class_name field + * @return a tuple with the module name (factory string) and the + * algorithm name + */ + static std::pair + get_class_info(const std::string& class_name); + }; +} // namespace shogun + +#endif // SHOGUN_SHOGUNOPENML_H diff --git a/src/shogun/io/openml/utils.h b/src/shogun/io/openml/utils.h new file mode 100644 index 00000000000..a51da80857e --- /dev/null +++ b/src/shogun/io/openml/utils.h @@ -0,0 +1,131 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). 
+ *
+ * Authors: Gil Hoben
+ */
+
+#ifndef SHOGUN_OPENML_UTILS_H
+#define SHOGUN_OPENML_UTILS_H
+
+#include
+
+#include
+
+namespace shogun
+{
+	namespace openml_detail
+	{
+		/**
+		 * Checks the returned response from OpenML in JSON format.
+		 * Reports a server error if the document has an "error" member,
+		 * otherwise requires that the expected top-level member exists.
+		 *
+		 * @param doc the parsed OpenML JSON format response
+		 * @param type the name of the top-level member expected in doc
+		 */
+		static void
+		check_response(const rapidjson::Document& doc, const std::string& type)
+		{
+			if (SG_UNLIKELY(doc.HasMember("error")))
+			{
+				const rapidjson::Value& root = doc["error"];
+				SG_SERROR(
+				    "Server error %s: %s\n", root["code"].GetString(),
+				    root["message"].GetString())
+				return;
+			}
+			REQUIRE(
+			    doc.HasMember(type.c_str()),
+			    "Unexpected format of OpenML %s.\n", type.c_str());
+		}
+
+		/**
+		 * Helper function to add JSON objects as string in map.
+		 * A member that is not of string type is stored as "".
+		 *
+		 * @param v a RapidJSON GenericValue, i.e. string
+		 * @param param_dict the map to write to
+		 * @param name the name of the key
+		 */
+		static SG_FORCED_INLINE void emplace_string_to_map(
+		    const rapidjson::GenericValue>& v,
+		    std::unordered_map& param_dict,
+		    const std::string& name)
+		{
+			if (v[name.c_str()].GetType() == rapidjson::Type::kStringType)
+				param_dict.emplace(name, v[name.c_str()].GetString());
+			else
+				param_dict.emplace(name, "");
+		}
+
+		/**
+		 * Helper function to add JSON objects as string in map.
+		 * A member that is not of string type is stored as "".
+		 *
+		 * @param v a RapidJSON GenericObject, i.e. array
+		 * @param param_dict the map to write to
+		 * @param name the name of the key
+		 */
+		static SG_FORCED_INLINE void emplace_string_to_map(
+		    const rapidjson::GenericObject<
+		        true, rapidjson::GenericValue>>& v,
+		    std::unordered_map& param_dict,
+		    const std::string& name)
+		{
+			if (v[name.c_str()].GetType() == rapidjson::Type::kStringType)
+				param_dict.emplace(name, v[name.c_str()].GetString());
+			else
+				param_dict.emplace(name, "");
+		}
+
+		/**
+		 * Generic fallback for types without a specialisation: reports
+		 * "not implemented".
+		 *
+		 * @param name the name of the member to read
+		 * @param v the JSON object to read from
+		 */
+		template
+		SG_FORCED_INLINE T return_if_possible(
+		    const std::string& name,
+		    const rapidjson::GenericObject<
+		        true, rapidjson::GenericValue>>& v)
+		{
+			SG_SNOTIMPLEMENTED
+		}
+
+		/**
+		 * Reads a string member from a JSON object.
+		 *
+		 * @param name the name of the member to read
+		 * @param v the JSON object to read from
+		 * @return the member value, or "" if the member does not exist.
+		 * An error is reported if the member exists but is not a string.
+		 */
+		template <>
+		SG_FORCED_INLINE std::string return_if_possible(
+		    const std::string& name,
+		    const rapidjson::GenericObject<
+		        true, rapidjson::GenericValue>>& v)
+		{
+			if (!v.HasMember(name.c_str()))
+				return "";
+			const auto& member = v[name.c_str()];
+			if (member.IsString())
+				return member.GetString();
+			SG_SERROR(
+			    "Found member \"%s\" but it is not a string", name.c_str())
+			// never build a std::string from a null pointer; fall back to
+			// an empty string if the error macro returns
+			return "";
+		}
+
+		/**
+		 * Reads a member that is either a single string or an array of
+		 * strings from a JSON object.
+		 *
+		 * @param name the name of the member to read
+		 * @param v the JSON object to read from
+		 * @return the string(s) found; empty if the member is neither a
+		 * string nor an array. An error is reported if the member does
+		 * not exist or if the array holds a non-string element.
+		 */
+		template <>
+		SG_FORCED_INLINE std::vector
+		return_if_possible>(
+		    const std::string& name,
+		    const rapidjson::GenericObject<
+		        true, rapidjson::GenericValue>>& v)
+		{
+			std::vector result;
+			if (!v.HasMember(name.c_str()))
+			{
+				SG_SERROR(
+				    "\"%s\" is not a member of the given object", name.c_str())
+				// do not index a member that is known to be missing
+				return result;
+			}
+			const auto& member = v[name.c_str()];
+			if (member.IsString())
+			{
+				result.emplace_back(member.GetString());
+			}
+			else if (member.IsArray())
+			{
+				for (const auto& val : member.GetArray())
+				{
+					if (val.IsString())
+						result.emplace_back(val.GetString());
+					else
+						SG_SERROR(
+						    "Found non string member in \"%s\".\n",
+						    name.c_str())
+				}
+			}
+			return result;
+		}
+	} // namespace openml_detail
+} // namespace shogun
+#endif // SHOGUN_OPENML_UTILS_H
From f9f9c79abb86eb05565c9b6b2cc9acaacb8ba92b Mon Sep 17 00:00:00 2001 
From: gf712 Date: Sat, 18 May 2019 18:18:11 +0100 Subject: [PATCH 28/32] train and test index getter --- src/shogun/io/openml/OpenMLReader.cpp | 2 +- src/shogun/io/openml/OpenMLSplit.cpp | 70 +++++++++++------------- src/shogun/io/openml/OpenMLSplit.h | 76 ++++++++++++++++++++++----- src/shogun/io/openml/OpenMLTask.cpp | 23 +++++--- src/shogun/io/openml/OpenMLTask.h | 8 +-- src/shogun/io/openml/utils.h | 6 ++- 6 files changed, 121 insertions(+), 64 deletions(-) diff --git a/src/shogun/io/openml/OpenMLReader.cpp b/src/shogun/io/openml/OpenMLReader.cpp index bd9d1b7de34..3751e686d19 100644 --- a/src/shogun/io/openml/OpenMLReader.cpp +++ b/src/shogun/io/openml/OpenMLReader.cpp @@ -51,7 +51,7 @@ const char* OpenMLReader::flow_file = "/flow/{}"; /* TASK API */ const char* OpenMLReader::task_file = "/task/{}"; /* SPLIT API */ -const char* OpenMLReader::get_split = "/get/{}"; +const char* OpenMLReader::get_split = ""; const std::unordered_map OpenMLReader::m_format_options = {{"xml", xml_server}, diff --git a/src/shogun/io/openml/OpenMLSplit.cpp b/src/shogun/io/openml/OpenMLSplit.cpp index c4481a860cd..19569137e0d 100644 --- a/src/shogun/io/openml/OpenMLSplit.cpp +++ b/src/shogun/io/openml/OpenMLSplit.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -17,16 +18,16 @@ std::shared_ptr OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) { auto reader = OpenMLReader(api_key); - auto return_string = reader.get("get_split", "split", split_url); + auto return_string = reader.get(split_url); if (return_string == "Task not providing datasplits.") return std::make_shared(); - auto return_stream = std::istringstream(return_string); - // TODO: add ARFF parsing here - // get train/test indices - // TODO: replace line below with ARFFDeserialiser::get_features() - auto arff_features = std::make_shared(); + std::shared_ptr return_stream = + std::make_shared(return_string); + auto arff_parser = ARFFDeserializer(return_stream); + 
arff_parser.read(); + auto arff_features = arff_parser.get_features(); REQUIRE( arff_features->get_num_feature_obj() == 4, "Expected a ARFF file with 4 attributes: type, rowid, repeat and " @@ -41,30 +42,33 @@ OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) auto fold_feat = std::shared_ptr(arff_features->get_feature_obj(3)); - auto type_vector = string_feature_to_vector(train_test_feat); + auto type_vector = nominal_feature_to_vector(train_test_feat); auto rowid_vector = dense_feature_to_vector(rowid_feat); auto repeat_vector = dense_feature_to_vector(repeat_feat); auto fold_vector = dense_feature_to_vector(fold_feat); - std::vector> train_idx, test_idx; - for (int i = 0; i < arff_features->get_num_vectors(); ++i) + std::array, 3> train_idx, test_idx; + + for (int i = 0; i < train_test_feat->get_num_vectors(); ++i) { if (type_vector[i] == LabelType::TRAIN) - train_idx.emplace_back(std::initializer_list{ - static_cast(rowid_vector[i]), - static_cast(repeat_vector[i]), - static_cast(fold_vector[i])}); + { + train_idx[0].push_back(rowid_vector[i]); + train_idx[1].push_back(repeat_vector[i]); + train_idx[2].push_back(fold_vector[i]); + } else - test_idx.emplace_back(std::initializer_list{ - static_cast(rowid_vector[i]), - static_cast(repeat_vector[i]), - static_cast(fold_vector[i])}); + { + test_idx[0].push_back(rowid_vector[i]); + test_idx[1].push_back(repeat_vector[i]); + test_idx[2].push_back(fold_vector[i]); + } } return std::make_shared(train_idx, test_idx); } -SGVector +SGMatrix OpenMLSplit::dense_feature_to_vector(const std::shared_ptr& feat) { auto casted_feat = @@ -73,40 +77,28 @@ OpenMLSplit::dense_feature_to_vector(const std::shared_ptr& feat) if (!casted_feat) SG_SERROR("Error casting a column in the split file from CFeatures to " "CDenseFeatures!\n>"); - return casted_feat->get_feature_vector(0); + return casted_feat->get_feature_matrix(); } std::vector -OpenMLSplit::string_feature_to_vector(const std::shared_ptr& feat) 
+OpenMLSplit::nominal_feature_to_vector(const std::shared_ptr& feat) { - auto casted_feat = std::dynamic_pointer_cast>(feat); + auto casted_feat = + std::dynamic_pointer_cast>(feat); // this should never happen if (!casted_feat) SG_SERROR("Error casting a column in the split file from CFeatures to " - "CStringFeatures!\n"); - - auto to_lower = [](const std::string& line) { - std::string result; - std::transform( - line.begin(), line.end(), std::back_inserter(result), - [](uint8_t val) { return std::tolower(val); }); - return result; - }; + "CDenseFeatures!\n"); std::vector result; - for (int i = 0; i < casted_feat->get_num_vectors(); ++i) + for (const auto& el : casted_feat->get_feature_matrix()) { - auto row = casted_feat->get_feature_vector(i); - std::string label(1, row[0]); - for (auto j = 1; j < casted_feat->get_max_vector_length(); ++j) - label.append(1, row[j]); - if (to_lower(label) == "train") + if (el == 0) result.push_back(LabelType::TRAIN); - else if (to_lower(label) == "test") - result.push_back(LabelType::TEST); else - SG_SERROR("Unknown label type in split file %s!\n", label.c_str()) + result.push_back(LabelType::TEST); } + return result; } \ No newline at end of file diff --git a/src/shogun/io/openml/OpenMLSplit.h b/src/shogun/io/openml/OpenMLSplit.h index ab61aebdd77..f032ee91647 100644 --- a/src/shogun/io/openml/OpenMLSplit.h +++ b/src/shogun/io/openml/OpenMLSplit.h @@ -10,6 +10,8 @@ #include #include +#include + namespace shogun { /** @@ -28,44 +30,92 @@ namespace shogun * Default constructor. This is used when there are no * train or test indices. 
*/
-		OpenMLSplit() = default;
+		OpenMLSplit() : m_repeat_count(0), m_fold_count(0)
+		{
+		}
 
+		/**
+		 * Builds a split from train and test index columns. Each array
+		 * holds three columns of equal length: [0] data index,
+		 * [1] repeat index, [2] fold index.
+		 *
+		 * @param train_idx the three index columns of the training set
+		 * @param test_idx the three index columns of the test set
+		 */
 		OpenMLSplit(
-		    std::vector> train_idx,
-		    std::vector> test_idx)
-		    : m_train_idx(std::move(train_idx)), m_test_idx(std::move(test_idx))
+		    const std::array, 3>& train_idx,
+		    const std::array, 3>& test_idx)
+		    : m_train_idx(train_idx), m_test_idx(test_idx)
 		{
+			// check the column lengths before any column is read
+			REQUIRE(
+			    train_idx[0].size() == train_idx[1].size() &&
+			        train_idx[0].size() == train_idx[2].size(),
+			    "All dimensions in train_idx must match!\n")
+			REQUIRE(
+			    test_idx[0].size() == test_idx[1].size() &&
+			        test_idx[0].size() == test_idx[2].size(),
+			    "All dimensions in test_idx must match!\n")
+
+			// repeats and folds are zero indexed so add 1
+			// we also assume that the repeats and folds indices go from
+			// 0,1,...,N in increments of 1
+			// an empty column counts as 0 rather than dereferencing the
+			// end iterator that std::max_element returns for it
+			const auto count_of = [](const auto& indices) {
+				return indices.empty()
+				           ? 0
+				           : *std::max_element(
+				                 indices.begin(), indices.end()) +
+				                 1;
+			};
+
+			m_data_count =
+			    std::make_pair(train_idx[0].size(), test_idx[0].size());
+			m_repeat_count = count_of(train_idx[1]);
+			m_fold_count = count_of(train_idx[2]);
+			const auto test_repeat_count = count_of(test_idx[1]);
+			const auto test_fold_count = count_of(test_idx[2]);
+
+			if (m_repeat_count != test_repeat_count)
+				SG_SERROR(
+				    "Expected the train and test set to have the same number "
+				    "of repeats, but got %d and %d respectively.\n",
+				    m_repeat_count, test_repeat_count)
+			// compare the fold counts here, not the repeat counts again
+			if (m_fold_count != test_fold_count)
+				SG_SERROR(
+				    "Expected the train and test set to have the same number "
+				    "of folds, but got %d and %d respectively.\n",
+				    m_fold_count, test_fold_count)
 		}
 
 		static std::shared_ptr
 		get_split(const std::string& split_url, const std::string& api_key);
 
-		SG_FORCED_INLINE std::vector> get_train_idx() const
-		    noexcept
+		SG_FORCED_INLINE std::array, 3>
+		get_train_idx() const noexcept
 		{
 			return m_train_idx;
 		}
 
-		SG_FORCED_INLINE std::vector> get_test_idx() const
-		    noexcept
+		SG_FORCED_INLINE std::array, 3>
+		get_test_idx() const 
noexcept { return m_test_idx; } SG_FORCED_INLINE bool contains_splits() const noexcept { - return !m_train_idx.empty() && !m_test_idx.empty(); + return !m_train_idx[0].empty() && !m_test_idx[0].empty(); + } + + SG_FORCED_INLINE int32_t get_num_repeats() const noexcept + { + return m_repeat_count; + } + + SG_FORCED_INLINE int32_t get_num_folds() const noexcept + { + return m_fold_count; } private: - static SGVector + static SGMatrix dense_feature_to_vector(const std::shared_ptr& feat); static std::vector - string_feature_to_vector(const std::shared_ptr& feat); + nominal_feature_to_vector(const std::shared_ptr& feat); - std::vector> m_train_idx; - std::vector> m_test_idx; + std::array, 3> m_train_idx; + std::array, 3> m_test_idx; + std::pair m_data_count; + int32_t m_repeat_count; + int32_t m_fold_count; }; } // namespace shogun #endif // SHOGUN_OPENMLSPLIT_H diff --git a/src/shogun/io/openml/OpenMLTask.cpp b/src/shogun/io/openml/OpenMLTask.cpp index 493664a8367..f1382d3225e 100644 --- a/src/shogun/io/openml/OpenMLTask.cpp +++ b/src/shogun/io/openml/OpenMLTask.cpp @@ -118,20 +118,31 @@ OpenMLTask::get_task_from_string(const std::string& task_type) SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) } -std::vector> OpenMLTask::get_train_indices() const +std::vector>> +OpenMLTask::get_train_indices() const { return get_indices(m_split->get_train_idx()); } -std::vector> OpenMLTask::get_test_indices() const +std::vector>> +OpenMLTask::get_test_indices() const { return get_indices(m_split->get_test_idx()); } -std::vector> -OpenMLTask::get_indices(const std::vector>& idx) const +std::vector>> +OpenMLTask::get_indices(const std::array, 3>& idx) const { - SG_SNOTIMPLEMENTED - std::vector> result; + // result = (n_repeats, n_folds, ?) 
where is the number of indices in a + // given fold + std::vector>> result( + m_split->get_num_repeats(), + std::vector>( + m_split->get_num_folds(), std::vector{})); + for (int i = 0; i < idx[0].size(); ++i) + { + // result[repeat][fold].push_back(data_index) + result[idx[1][i]][idx[2][i]].push_back(idx[0][i]); + } return result; } \ No newline at end of file diff --git a/src/shogun/io/openml/OpenMLTask.h b/src/shogun/io/openml/OpenMLTask.h index a849ea37de9..7115b87bf71 100644 --- a/src/shogun/io/openml/OpenMLTask.h +++ b/src/shogun/io/openml/OpenMLTask.h @@ -62,9 +62,9 @@ namespace shogun return m_split; } - std::vector> get_train_indices() const; + std::vector>> get_train_indices() const; - std::vector> get_test_indices() const; + std::vector>> get_test_indices() const; #ifndef SWIG SG_FORCED_INLINE TaskType @@ -79,8 +79,8 @@ namespace shogun private: static TaskType get_task_from_string(const std::string& task_type); - std::vector> - get_indices(const std::vector>& idx) const; + std::vector>> + get_indices(const std::array, 3>& idx) const; std::string m_task_id; std::string m_task_name; diff --git a/src/shogun/io/openml/utils.h b/src/shogun/io/openml/utils.h index a51da80857e..122ce170098 100644 --- a/src/shogun/io/openml/utils.h +++ b/src/shogun/io/openml/utils.h @@ -44,10 +44,14 @@ namespace shogun static SG_FORCED_INLINE void emplace_string_to_map( const rapidjson::GenericValue>& v, std::unordered_map& param_dict, - const std::string& name) + const std::string& name, bool required = false) { if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) param_dict.emplace(name, v[name.c_str()].GetString()); + else if (required) + SG_SERROR( + "The field \"%s\" is expected to be a string!\n", + name.c_str()) else param_dict.emplace(name, ""); } From 37e8c638226748761cd247e9ec6095e0e0643b29 Mon Sep 17 00:00:00 2001 From: gf712 Date: Mon, 20 May 2019 08:45:05 +0100 Subject: [PATCH 29/32] added crossvalidation --- src/shogun/io/openml/OpenMLData.cpp | 174 
++++++++++++++------------ src/shogun/io/openml/OpenMLData.h | 3 + src/shogun/io/openml/OpenMLRun.cpp | 143 ++++++++++++++++++--- src/shogun/io/openml/OpenMLTask.cpp | 4 +- src/shogun/io/openml/OpenMLTask.h | 14 ++- src/shogun/io/openml/ShogunOpenML.cpp | 2 +- 6 files changed, 242 insertions(+), 98 deletions(-) diff --git a/src/shogun/io/openml/OpenMLData.cpp b/src/shogun/io/openml/OpenMLData.cpp index 66dc3dcf482..a85a166ad78 100644 --- a/src/shogun/io/openml/OpenMLData.cpp +++ b/src/shogun/io/openml/OpenMLData.cpp @@ -32,55 +32,55 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) const Value& dataset_description = document["data_set_description"]; auto name = return_if_possible( - "name", dataset_description.GetObject()); + "name", dataset_description.GetObject()); auto description = return_if_possible( - "description", dataset_description.GetObject()); + "description", dataset_description.GetObject()); auto data_format = return_if_possible( - "data_format", dataset_description.GetObject()); + "data_format", dataset_description.GetObject()); auto dataset_id = - return_if_possible("id", dataset_description.GetObject()); + return_if_possible("id", dataset_description.GetObject()); auto version = return_if_possible( - "version", dataset_description.GetObject()); + "version", dataset_description.GetObject()); auto creator = return_if_possible( - "creator", dataset_description.GetObject()); + "creator", dataset_description.GetObject()); auto contributor = return_if_possible( - "contributor", dataset_description.GetObject()); + "contributor", dataset_description.GetObject()); auto collection_date = return_if_possible( - "collection_date", dataset_description.GetObject()); + "collection_date", dataset_description.GetObject()); auto upload_date = return_if_possible( - "upload_date", dataset_description.GetObject()); + "upload_date", dataset_description.GetObject()); auto language = return_if_possible( - "language", 
dataset_description.GetObject()); + "language", dataset_description.GetObject()); auto licence = return_if_possible( - "licence", dataset_description.GetObject()); + "licence", dataset_description.GetObject()); auto url = - return_if_possible("url", dataset_description.GetObject()); + return_if_possible("url", dataset_description.GetObject()); auto default_target_attribute = return_if_possible( - "default_target_attribute", dataset_description.GetObject()); + "default_target_attribute", dataset_description.GetObject()); auto row_id_attribute = return_if_possible( - "row_id_attribute", dataset_description.GetObject()); + "row_id_attribute", dataset_description.GetObject()); auto ignore_attribute = return_if_possible( - "ignore_attribute", dataset_description.GetObject()); + "ignore_attribute", dataset_description.GetObject()); auto version_label = return_if_possible( - "version_label", dataset_description.GetObject()); + "version_label", dataset_description.GetObject()); auto citation = return_if_possible( - "citation", dataset_description.GetObject()); + "citation", dataset_description.GetObject()); auto tags = return_if_possible>( - "tag", dataset_description.GetObject()); + "tag", dataset_description.GetObject()); auto visibility = return_if_possible( - "visibility", dataset_description.GetObject()); + "visibility", dataset_description.GetObject()); auto original_data_url = return_if_possible( - "original_data_url", dataset_description.GetObject()); + "original_data_url", dataset_description.GetObject()); auto paper_url = return_if_possible( - "paper_url", dataset_description.GetObject()); + "paper_url", dataset_description.GetObject()); auto update_comment = return_if_possible( - "update_comment", dataset_description.GetObject()); + "update_comment", dataset_description.GetObject()); auto md5_checksum = return_if_possible( - "md5_checksum", dataset_description.GetObject()); + "md5_checksum", dataset_description.GetObject()); // features std::vector>> - 
param_vector; + param_vector; return_string = reader.get("data_features", "json", id); document.Parse(return_string.c_str()); check_response(document, "data_features"); @@ -115,8 +115,8 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) { if (param_quality.name.IsString() && param_quality.value.IsString()) param_map.emplace( - param_quality.name.GetString(), - param_quality.value.GetString()); + param_quality.name.GetString(), + param_quality.value.GetString()); else if (param_quality.name.IsString()) param_map.emplace(param_quality.name.GetString(), ""); } @@ -124,11 +124,11 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) } auto result = std::make_shared( - name, description, data_format, dataset_id, version, creator, - contributor, collection_date, upload_date, language, licence, url, - default_target_attribute, row_id_attribute, ignore_attribute, - version_label, citation, tags, visibility, original_data_url, paper_url, - update_comment, md5_checksum, param_vector, qualities_vector); + name, description, data_format, dataset_id, version, creator, + contributor, collection_date, upload_date, language, licence, url, + default_target_attribute, row_id_attribute, ignore_attribute, + version_label, citation, tags, visibility, original_data_url, paper_url, + update_comment, md5_checksum, param_vector, qualities_vector); result->set_api_key(api_key); return result; } @@ -145,11 +145,9 @@ std::shared_ptr OpenMLData::get_features(const std::string& label) if (!m_cached_features) get_data(); auto find_label = - std::find(m_feature_names.begin(), m_feature_names.end(), label); + std::find(m_feature_names.begin(), m_feature_names.end(), label); if (find_label == m_feature_names.end()) SG_SERROR("Requested label \"%s\" not in the dataset!\n", label.c_str()) - if (!m_cached_features) - get_data(); auto col_idx = std::distance(m_feature_names.begin(), find_label); auto feat_type_copy = m_feature_types; 
feat_type_copy.erase(feat_type_copy.begin() + col_idx); @@ -158,84 +156,102 @@ std::shared_ptr OpenMLData::get_features(const std::string& label) if (type == ARFFDeserializer::Attribute::STRING) SG_SERROR("Currently cannot process string features!\n") } - std::shared_ptr result; + // auto result = std::make_shared(); + std::shared_ptr> result; bool first = true; for (int i = 0; i < m_feature_types.size(); ++i) { if (i != col_idx && first) { - result.reset(m_cached_features->get_feature_obj(i)); + result.reset(m_cached_features->get_feature_obj(i) + ->as>()); first = false; } if (i != col_idx) - result.reset(result->create_merged_copy( - m_cached_features->get_feature_obj(i))); + result.reset( + result + ->create_merged_copy(m_cached_features->get_feature_obj(i)) + ->as>()); } - std::dynamic_pointer_cast>(result)->set_num_features(m_feature_types.size()); - std::dynamic_pointer_cast>(result)->set_num_vectors(m_cached_features->get_num_vectors()); + + // need to copy data as result is only in the stack and the data + // will be gone at the end of the function + auto* copy_feat = SG_MALLOC( + float64_t, + m_feature_types.size() * m_cached_features->get_num_vectors()); + memcpy( + copy_feat, result->get_feature_matrix().data(), + m_feature_types.size() * m_cached_features->get_num_vectors()); + + result = std::make_shared>( + copy_feat, m_feature_types.size(), + m_cached_features->get_num_vectors()); return result; } std::shared_ptr OpenMLData::get_labels() { - if (!m_cached_features) - get_data(); REQUIRE( - !m_default_target_attribute.empty(), - "A default target attribute is required if no label is given!\n") + !m_default_target_attribute.empty(), + "A default target attribute is required if no label is given!\n") return get_labels(m_default_target_attribute); } std::shared_ptr OpenMLData::get_labels(const std::string& label_name) { + if (m_cached_labels && label_name == m_cached_label_name) + return m_cached_labels; + if (!m_cached_features) get_data(); + auto 
find_label = - std::find(m_feature_names.begin(), m_feature_names.end(), label_name); + std::find(m_feature_names.begin(), m_feature_names.end(), label_name); if (find_label == m_feature_names.end()) SG_SERROR( - "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) + "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) auto col_idx = std::distance(m_feature_names.begin(), find_label); - if (!m_cached_features) - get_data(); - auto target_label_as_feat = - std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); + std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); switch (m_feature_types[col_idx]) { - // real features - case ARFFDeserializer::Attribute::REAL: - case ARFFDeserializer::Attribute::NUMERIC: - case ARFFDeserializer::Attribute::INTEGER: - case ARFFDeserializer::Attribute::DATE: - { - auto casted_feat = std::dynamic_pointer_cast>( - target_label_as_feat); - auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); - auto labels = std::make_shared(labels_vec); - return labels; - } - break; - // nominal features - case ARFFDeserializer::Attribute::NOMINAL: + // real features + case ARFFDeserializer::Attribute::REAL: + case ARFFDeserializer::Attribute::NUMERIC: + case ARFFDeserializer::Attribute::INTEGER: + case ARFFDeserializer::Attribute::DATE: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + auto labels = std::make_shared(labels_vec); + m_cached_labels = labels; + m_cached_label_name = label_name; + return m_cached_labels; + } + break; + // nominal features + case ARFFDeserializer::Attribute::NOMINAL: + { + auto casted_feat = std::dynamic_pointer_cast>( + target_label_as_feat); + auto labels_vec = casted_feat->get_feature_matrix().get_row_vector(0); + for (auto& val : labels_vec) { - auto casted_feat = std::dynamic_pointer_cast>( - target_label_as_feat); - auto labels_vec = 
casted_feat->get_feature_matrix().get_row_vector(0); - for(auto& val: labels_vec) - { - if (val == 0) - val = -1; - } - auto labels = std::make_shared(labels_vec); - return labels; + if (val == 0) + val = -1; } - break; - default: - SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) + auto labels = std::make_shared(labels_vec); + m_cached_labels = labels; + m_cached_label_name = label_name; + return m_cached_labels; + } + break; + default: + SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str()) } return nullptr; @@ -245,7 +261,7 @@ void OpenMLData::get_data() { auto reader = OpenMLReader(m_api_key); std::shared_ptr ss = - std::make_shared(reader.get(m_url)); + std::make_shared(reader.get(m_url)); auto parser = ARFFDeserializer(ss); parser.read(); diff --git a/src/shogun/io/openml/OpenMLData.h b/src/shogun/io/openml/OpenMLData.h index 7e21474775b..25bf54f3084 100644 --- a/src/shogun/io/openml/OpenMLData.h +++ b/src/shogun/io/openml/OpenMLData.h @@ -90,6 +90,7 @@ namespace shogun */ std::shared_ptr get_labels(); + /** * Returns the dataset labels given the label_name * @return the labels @@ -155,6 +156,8 @@ namespace shogun std::shared_ptr m_cached_features; std::vector m_feature_names; std::vector m_feature_types; + std::shared_ptr m_cached_labels; + std::string m_cached_label_name; }; } // namespace shogun diff --git a/src/shogun/io/openml/OpenMLRun.cpp b/src/shogun/io/openml/OpenMLRun.cpp index 796545e10de..6c6420f54b3 100644 --- a/src/shogun/io/openml/OpenMLRun.cpp +++ b/src/shogun/io/openml/OpenMLRun.cpp @@ -4,6 +4,7 @@ * Authors: Gil Hoben */ +#include #include #include #include @@ -12,36 +13,150 @@ using namespace shogun; std::shared_ptr OpenMLRun::run_model_on_task( - std::shared_ptr model, std::shared_ptr task) + std::shared_ptr model, std::shared_ptr task) { SG_SNOTIMPLEMENTED return std::shared_ptr(); } std::shared_ptr OpenMLRun::run_flow_on_task( - std::shared_ptr flow, std::shared_ptr task) + std::shared_ptr flow, 
std::shared_ptr task) { auto data = task->get_dataset(); - std::shared_ptr train_features = nullptr, test_features = nullptr; - std::shared_ptr train_labels = nullptr, test_labels = nullptr; + std::shared_ptr features = nullptr; + std::shared_ptr labels = nullptr; + + auto model = ShogunOpenML::flow_to_model(std::move(flow), true); + + labels = data->get_labels(); + features = data->get_features(data->get_default_target_attribute()); + + auto storage = std::make_shared(); if (task->get_split()->contains_splits()) - SG_SNOTIMPLEMENTED + { + auto machine = std::dynamic_pointer_cast(model); + if (!machine) + { + SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") + } + auto train_idx = task->get_train_indices(); + auto test_idx = task->get_test_indices(); + + auto xval_storage = std::make_shared(); + xval_storage->set_num_folds(task->get_num_fold()); + xval_storage->set_num_runs(task->get_num_repeats()); + machine->set_store_model_features(true); + + // copied/adapted from crossvalidation + for (auto repeat_idx : range(task->get_num_repeats())) + { + for (auto fold_idx : range(task->get_num_fold())) + { + auto* fold = new CrossValidationFoldStorage(); + SG_REF(fold) + + auto cloned_machine = (CMachine*)machine->clone(); + + // TODO while these are not used through const interfaces, + // we unfortunately have to clone, even though these could be + // shared + auto features_clone = (CFeatures*)features->clone(); + auto labels_clone = (CLabels*)labels->clone(); + // auto evaluation_criterion = + // (CEvaluation*)m_evaluation_criterion->clone(); + + /* evtl. 
update xvalidation output class */ + fold->set_run_index(repeat_idx); + fold->set_fold_index(fold_idx); + + auto train_fold_idx = SGVector( + train_idx[repeat_idx][fold_idx].data(), + train_idx[repeat_idx][fold_idx].size(), false); + + features_clone->add_subset(train_fold_idx); + + /* set label subset for training */ + labels_clone->add_subset(train_fold_idx); + + SG_SDEBUG( + "train set repeat %d fold %d: %s\n", repeat_idx, fold_idx, + train_fold_idx.to_string().c_str()) + + /* train machine on training features and remove subset */ + SG_SDEBUG("starting training\n") + cloned_machine->set_labels(labels_clone); + cloned_machine->train(features_clone); + SG_SDEBUG("finished training\n") + + /* evtl. update xvalidation output class */ + fold->set_train_indices(train_fold_idx); + auto fold_machine = (CMachine*)cloned_machine->clone(); + fold->set_trained_machine(fold_machine); + SG_UNREF(fold_machine) + + features_clone->remove_subset(); + labels_clone->remove_subset(); + + /* set feature subset for testing (subset method that stores + * pointer) */ + auto test_fold_idx = SGVector( + test_idx[repeat_idx][fold_idx].data(), + test_idx[repeat_idx][fold_idx].size(), false); + features_clone->add_subset(test_fold_idx); + + /* set label subset for testing */ + labels_clone->add_subset(test_fold_idx); + + SG_SDEBUG( + "test set repeat %d fold %d: %s\n", repeat_idx, fold_idx, + test_fold_idx.to_string().c_str()) + + /* apply machine to test features and remove subset */ + SG_SDEBUG("starting evaluation\n") + SG_SDEBUG("%p\n", features_clone) + CLabels* result_labels = cloned_machine->apply(features_clone); + SG_SDEBUG("finished evaluation\n") + features_clone->remove_subset(); + SG_REF(result_labels); + + /* evaluate */ + // results[i] = + // evaluation_criterion->evaluate(result_labels, + //labels); SG_DEBUG("result on fold %d is %f\n", i, results[i]) + + /* evtl. 
update xvalidation output class */ + // fold->set_test_indices(test_fold_idx); + // fold->set_test_result(result_labels); + // auto* true_labels = (CLabels*)labels->clone(); + // fold->set_test_true_result(true_labels); + // SG_UNREF(true_labels) + // fold->post_update_results(); + // fold->set_evaluation_result(results[i]); + + storage->append_fold_result(fold); + // + // /* clean up, remove subsets */ + // labels->remove_subset(); + SG_UNREF(cloned_machine); + SG_UNREF(features_clone); + SG_UNREF(labels_clone); + // SG_UNREF(evaluation_criterion); + // SG_UNREF(result_labels); + SG_UNREF(fold) + } + } + } else { - train_labels = data->get_labels(); - train_features = - data->get_features(data->get_default_target_attribute()); // ensures delete is called by shared ptr destructor - SG_REF(train_labels.get()) - SG_REF(train_features.get()) - auto model = ShogunOpenML::flow_to_model(std::move(flow), true); - + SG_REF(labels.get()) + SG_REF(features.get()) if (auto machine = std::dynamic_pointer_cast(model)) { auto result = ShogunOpenML::run_model_on_fold( - machine, task, train_features, 0, 0, train_labels, - test_features); + machine, task, features, 0, 0, labels, + nullptr); SG_SDEBUG(result->to_string().c_str()); } else diff --git a/src/shogun/io/openml/OpenMLTask.cpp b/src/shogun/io/openml/OpenMLTask.cpp index f1382d3225e..c4a4c1fb00c 100644 --- a/src/shogun/io/openml/OpenMLTask.cpp +++ b/src/shogun/io/openml/OpenMLTask.cpp @@ -118,13 +118,13 @@ OpenMLTask::get_task_from_string(const std::string& task_type) SG_SERROR("OpenMLTask does not support \"%s\"", task_type.c_str()) } -std::vector>> +std::vector>> OpenMLTask::get_train_indices() const { return get_indices(m_split->get_train_idx()); } -std::vector>> +std::vector>> OpenMLTask::get_test_indices() const { return get_indices(m_split->get_test_idx()); diff --git a/src/shogun/io/openml/OpenMLTask.h b/src/shogun/io/openml/OpenMLTask.h index 7115b87bf71..9f5e97c1bef 100644 --- a/src/shogun/io/openml/OpenMLTask.h 
+++ b/src/shogun/io/openml/OpenMLTask.h @@ -62,9 +62,19 @@ namespace shogun return m_split; } - std::vector>> get_train_indices() const; + std::vector>> get_train_indices() const; - std::vector>> get_test_indices() const; + std::vector>> get_test_indices() const; + + int32_t get_num_fold() const noexcept + { + return m_split->get_num_folds(); + } + + int32_t get_num_repeats() const noexcept + { + return m_split->get_num_repeats(); + } #ifndef SWIG SG_FORCED_INLINE TaskType diff --git a/src/shogun/io/openml/ShogunOpenML.cpp b/src/shogun/io/openml/ShogunOpenML.cpp index 80dbc03810f..aff02d90a62 100644 --- a/src/shogun/io/openml/ShogunOpenML.cpp +++ b/src/shogun/io/openml/ShogunOpenML.cpp @@ -310,7 +310,7 @@ std::shared_ptr ShogunOpenML::run_model_on_fold( { if (auto machine = std::dynamic_pointer_cast(model_clone)) { - // TODO: refactor. more useless clones until smart pointers are merged + // TODO: refactor! more useless clones until smart pointers are merged machine->put("labels", y_train->clone()->as()); auto tmp = X_train.get(); machine->train(tmp); From f17bbe149d0644833845273673becc6b953c9e25 Mon Sep 17 00:00:00 2001 From: gf712 Date: Tue, 21 May 2019 15:52:55 +0100 Subject: [PATCH 30/32] some cleanups --- src/shogun/CMakeLists.txt | 3 + src/shogun/io/openml/OpenMLData.cpp | 28 +- src/shogun/io/openml/OpenMLFile.cpp | 117 ++++++++ .../openml/{OpenMLReader.h => OpenMLFile.h} | 15 +- src/shogun/io/openml/OpenMLFlow.cpp | 202 ++++++++++--- src/shogun/io/openml/OpenMLFlow.h | 97 +++++- src/shogun/io/openml/OpenMLReader.cpp | 97 ------ src/shogun/io/openml/OpenMLRun.cpp | 178 ++++------- src/shogun/io/openml/OpenMLRun.h | 4 +- src/shogun/io/openml/OpenMLSplit.cpp | 4 +- src/shogun/io/openml/OpenMLTask.cpp | 67 +++-- src/shogun/io/openml/OpenMLTask.h | 13 +- src/shogun/io/openml/ShogunOpenML.cpp | 280 +++++++++++++----- src/shogun/io/openml/ShogunOpenML.h | 22 +- src/shogun/io/openml/utils.h | 100 ++++--- 15 files changed, 797 insertions(+), 430 deletions(-) create 
mode 100644 src/shogun/io/openml/OpenMLFile.cpp rename src/shogun/io/openml/{OpenMLReader.h => OpenMLFile.h} (91%) delete mode 100644 src/shogun/io/openml/OpenMLReader.cpp diff --git a/src/shogun/CMakeLists.txt b/src/shogun/CMakeLists.txt index 82491ab2fb2..2a5049f4cbc 100644 --- a/src/shogun/CMakeLists.txt +++ b/src/shogun/CMakeLists.txt @@ -442,6 +442,9 @@ SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE ${RAPIDJSON_INCLUDE_DIRS}) include(external/bitsery) SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE ${BITSERY_INCLUDE_DIR}) +include(external/bitsery) +SHOGUN_INCLUDE_DIRS(SCOPE PRIVATE ${BITSERY_INCLUDE_DIR}) + if (NOT WIN32) # FIXME: HDF5 linking on WIN32 is broken. # at least with the hdf5 supplied in anaconda diff --git a/src/shogun/io/openml/OpenMLData.cpp b/src/shogun/io/openml/OpenMLData.cpp index a85a166ad78..728cb6b1484 100644 --- a/src/shogun/io/openml/OpenMLData.cpp +++ b/src/shogun/io/openml/OpenMLData.cpp @@ -9,11 +9,9 @@ #include #include -#include +#include #include -#include - using namespace shogun; using namespace shogun::openml_detail; using namespace rapidjson; @@ -22,14 +20,10 @@ std::shared_ptr OpenMLData::get_dataset(const std::string& id, const std::string& api_key) { // description - Document document; - auto reader = OpenMLReader(api_key); + auto reader = OpenMLFile(api_key); auto return_string = reader.get("dataset_description", "json", id); - document.Parse(return_string.c_str()); - check_response(document, "data_set_description"); - - const Value& dataset_description = document["data_set_description"]; + auto& dataset_description = check_response(return_string, "data_set_description"); auto name = return_if_possible( "name", dataset_description.GetObject()); @@ -82,9 +76,9 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) std::vector>> param_vector; return_string = reader.get("data_features", "json", id); - document.Parse(return_string.c_str()); - check_response(document, "data_features"); - const Value& dataset_features = 
document["data_features"]; + + auto& dataset_features = check_response(return_string, "data_features"); + for (const auto& param : dataset_features["feature"].GetArray()) { std::unordered_map> param_map; @@ -105,9 +99,9 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) // qualities std::vector> qualities_vector; return_string = reader.get("data_qualities", "json", id); - document.Parse(return_string.c_str()); - check_response(document, "data_qualities"); - const Value& data_qualities = document["data_qualities"]; + + auto& data_qualities = check_response(return_string, "data_qualities"); + for (const auto& param : data_qualities["quality"].GetArray()) { std::unordered_map param_map; @@ -154,7 +148,7 @@ std::shared_ptr OpenMLData::get_features(const std::string& label) for (const auto type : feat_type_copy) { if (type == ARFFDeserializer::Attribute::STRING) - SG_SERROR("Currently cannot process string features!\n") + SG_SNOTIMPLEMENTED } // auto result = std::make_shared(); std::shared_ptr> result; @@ -259,7 +253,7 @@ std::shared_ptr OpenMLData::get_labels(const std::string& label_name) void OpenMLData::get_data() { - auto reader = OpenMLReader(m_api_key); + auto reader = OpenMLFile(m_api_key); std::shared_ptr ss = std::make_shared(reader.get(m_url)); diff --git a/src/shogun/io/openml/OpenMLFile.cpp b/src/shogun/io/openml/OpenMLFile.cpp new file mode 100644 index 00000000000..3e20e4b264f --- /dev/null +++ b/src/shogun/io/openml/OpenMLFile.cpp @@ -0,0 +1,117 @@ +/* + * This software is distributed under BSD 3-clause license (see LICENSE file). + * + * Authors: Gil Hoben + */ + +#include + +#ifdef HAVE_CURL +#include "OpenMLFile.h" +#include + +#endif // HAVE_CURL + +using namespace shogun; + +/** + * The writer callback function used to write the packets to a C++ string. 
+ * @param data the data received in CURL request + * @param size always 1 + * @param nmemb the size of data + * @param buffer_in the buffer to write to + * @return the size of buffer that was written + */ +size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) +{ + // check that the buffer string points to something + if (buffer_in != nullptr) + { + // Append the data to the buffer + buffer_in->append(data, size * nmemb); + + return size * nmemb; + } + return 0; +} + +/* OpenML server format */ +const char* OpenMLFile::xml_server = "https://www.openml.org/api/v1/xml"; +const char* OpenMLFile::json_server = "https://www.openml.org/api/v1/json"; +const char* OpenMLFile::download_server = ""; +const char* OpenMLFile::splits_server = "https://www.openml.org/api_splits"; + +/* DATA API */ +const char* OpenMLFile::dataset_description = "/data/{}"; +const char* OpenMLFile::list_data_qualities = "/data/qualities/list"; +const char* OpenMLFile::data_features = "/data/features/{}"; +const char* OpenMLFile::data_qualities = "/data/qualities/{}"; +const char* OpenMLFile::list_dataset_qualities = "/data/qualities/{}"; +const char* OpenMLFile::list_dataset_filter = "/data/list/{}"; +/* FLOW API */ +const char* OpenMLFile::flow_file = "/flow/{}"; +const char* OpenMLFile::flow_exists = "/flow/exists/{}"; +/* TASK API */ +const char* OpenMLFile::task_file = "/task/{}"; +/* SPLIT API */ +const char* OpenMLFile::get_split = ""; + +const std::unordered_map + OpenMLFile::m_format_options = {{"xml", xml_server}, + {"json", json_server}, + {"split", splits_server}, + {"download", download_server}}; +const std::unordered_map + OpenMLFile::m_request_options = { + {"dataset_description", dataset_description}, + {"list_data_qualities", list_data_qualities}, + {"data_features", data_features}, + {"data_qualities", data_qualities}, + {"list_dataset_qualities", list_dataset_qualities}, + {"list_dataset_filter", list_dataset_filter}, + {"flow_file", flow_file}, + 
{"flow_exists", flow_exists}, + {"task_file", task_file}}; + +void OpenMLFile::openml_curl_request_helper(const std::string& url) +{ +#ifdef HAVE_CURL + auto curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.\n") + } + + curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl_handle, CURLOPT_HTTPGET, 1); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); + + CURLcode res = curl_easy_perform(curl_handle); + + if (res != CURLE_OK) + SG_SERROR("Connection error: %s.\n", curl_easy_strerror(res)) + + curl_easy_cleanup(curl_handle); +#endif // HAVE_CURL +} + +std::string OpenMLFile::encode_string(const std::string& s) +{ +#ifdef HAVE_CURL + auto curl_handle = curl_easy_init(); + + if (!curl_handle) + { + SG_SERROR("Failed to initialise curl handle.\n") + } + + char* encoded_url = curl_easy_escape(curl_handle, s.c_str(), s.size()); + if (!encoded_url) + SG_SERROR("Failed to encode \"%s\" URL escaped.\n", s.c_str()) + return encoded_url; +#else + return s; +#endif +} diff --git a/src/shogun/io/openml/OpenMLReader.h b/src/shogun/io/openml/OpenMLFile.h similarity index 91% rename from src/shogun/io/openml/OpenMLReader.h rename to src/shogun/io/openml/OpenMLFile.h index 8078b1e78ec..ef1796b09ff 100644 --- a/src/shogun/io/openml/OpenMLReader.h +++ b/src/shogun/io/openml/OpenMLFile.h @@ -20,11 +20,11 @@ namespace shogun /** * Reads OpenML streams which can be downloaded with this function. 
*/ - class OpenMLReader + class OpenMLFile { public: - explicit OpenMLReader(const std::string& api_key) : m_api_key(api_key) + explicit OpenMLFile(const std::string& api_key) : m_api_key(api_key) { } @@ -93,7 +93,7 @@ namespace shogun std::string args_string = std::accumulate( args_vec.begin() + 1, args_vec.end(), args_vec.front(), [](std::string s0, std::string& s1) { - return s0 += "/" + s1; + return s0 += "/" + encode_string(s1); }); request_path += args_string; } @@ -113,11 +113,14 @@ namespace shogun return m_curl_response_buffer; #else SG_SERROR( - "This function is only available with the CURL library!\n") + "Please compile shogun with libcurl to query the OpenML server!\n") #endif // HAVE_CURL } private: + + static std::string encode_string(const std::string& s); + /** the raw buffer as a C++ string */ std::string m_curl_response_buffer; @@ -158,6 +161,7 @@ namespace shogun /* FLOW API */ static const char* flow_file; + static const char* flow_exists; /* TASK API */ static const char* task_file; @@ -174,6 +178,9 @@ namespace shogun public: OpenMLWritter(const std::string& api_key) : m_api_key(api_key){}; + template + bool post(const std::string& request, const std::string& format, const std::string& message, Args... args); + private: /** the user API key, likely to be needed to write to OpenML */ std::string m_api_key; diff --git a/src/shogun/io/openml/OpenMLFlow.cpp b/src/shogun/io/openml/OpenMLFlow.cpp index dd2f3f839a6..e36ecd251e3 100644 --- a/src/shogun/io/openml/OpenMLFlow.cpp +++ b/src/shogun/io/openml/OpenMLFlow.cpp @@ -4,9 +4,11 @@ * Authors: Gil Hoben */ -#include +#include +#include -#include +#include +#include #include #include @@ -15,50 +17,130 @@ using namespace shogun; using namespace shogun::openml_detail; using namespace rapidjson; +/** + * + */ +class ShogunToString : public AnyVisitor +{ +public: + explicit ShogunToString(){SG_SDEBUG("Debugging ShogunToString\n")}; + + void on(bool* v) final + { + m_string_val = (*v ? 
"true" : "false"); + } + void on(int32_t* v) final + { + m_string_val = std::to_string(*v); + } + void on(int64_t* v) final + { + m_string_val = std::to_string(*v); + } + void on(float* v) final + { + m_string_val = std::to_string(*v); + } + void on(double* v) final + { + m_string_val = std::to_string(*v); + } + void on(long double* v) + { + m_string_val = std::to_string(*v); + } + void on(CSGObject** v) final + { + m_string_val = ""; + } + void on(SGVector* v) final + { + m_string_val = v->to_string(); + } + void on(SGVector* v) final + { + m_string_val = v->to_string(); + } + void on(SGVector* v) final + { + m_string_val = v->to_string(); + } + void on(SGMatrix* mat) final + { + m_string_val = mat->to_string(); + } + void on(SGMatrix* mat) final + { + m_string_val = mat->to_string(); + } + void on(SGMatrix* mat) final + { + m_string_val = mat->to_string(); + } + + SG_FORCED_INLINE std::string get_string_value() const noexcept + { + return m_string_val; + } + +private: + std::string m_string_val; +}; std::shared_ptr OpenMLFlow::download_flow( - const std::string& flow_id, const std::string& api_key) + const std::string& flow_id, const std::string& api_key) { - Document document; parameters_type params; components_type components; - std::string name; - std::string description; - std::string class_name; // get flow and parse with RapidJSON - auto reader = OpenMLReader(api_key); + auto reader = OpenMLFile(api_key); auto return_string = reader.get("flow_file", "json", flow_id); - document.Parse(return_string.c_str()); - check_response(document, "flow"); - // store root for convenience. We know it exists from previous check. 
- const Value& root = document["flow"]; + auto& root = check_response(return_string, "flow"); + + std::string name = + return_if_possible("name", root.GetObject()); + std::string description = + return_if_possible("description", root.GetObject()); + std::string class_name = + return_if_possible("class_name", root.GetObject()); + std::string external_version = + return_if_possible("external_version", root.GetObject()); + + REQUIRE( + root["id"].GetString() == flow_id, + "The flow id returned by the server does not match the id provided. " + "Got %s instead of %s.\n", + root["id"].GetString(), flow_id.c_str()) // handle parameters if (root.HasMember("parameter")) { std::unordered_map param_dict; + OpenMLFlowParameter params_i{}; if (root["parameter"].IsArray()) { for (const auto& v : root["parameter"].GetArray()) { - emplace_string_to_map(v, param_dict, "data_type"); - emplace_string_to_map(v, param_dict, "default_value"); - emplace_string_to_map(v, param_dict, "description"); - params.emplace(v["name"].GetString(), param_dict); - param_dict.clear(); + add_string_to_struct(v, "name", params_i.name); + add_string_to_struct(v, "data_type", params_i.data_type); + add_string_to_struct( + v, "default_value", params_i.default_value); + add_string_to_struct(v, "description", params_i.description); + params.emplace(params_i.name, params_i); } } else { // parameter can also be a dict, instead of array - const auto v = root["parameter"].GetObject(); - emplace_string_to_map(v, param_dict, "data_type"); - emplace_string_to_map(v, param_dict, "default_value"); - emplace_string_to_map(v, param_dict, "description"); - params.emplace(v["name"].GetString(), param_dict); + const auto& v = root["parameter"].GetObject(); + add_string_to_struct(v, "name", params_i.name); + add_string_to_struct(v, "data_type", params_i.data_type); + add_string_to_struct(v, "default_value", params_i.default_value); + add_string_to_struct(v, "description", params_i.description); + 
params.emplace(params_i.name, params_i); } } @@ -70,30 +152,23 @@ std::shared_ptr OpenMLFlow::download_flow( for (const auto& v : root["component"].GetArray()) { components.emplace( - v["identifier"].GetString(), - OpenMLFlow::download_flow( - v["flow"]["id"].GetString(), api_key)); + must_return("identifier", v), + OpenMLFlow::download_flow( + v["flow"]["id"].GetString(), api_key)); } } else { components.emplace( - root["component"]["identifier"].GetString(), - OpenMLFlow::download_flow( - root["component"]["flow"]["id"].GetString(), api_key)); + must_return("identifier", root["component"]), + OpenMLFlow::download_flow( + root["component"]["flow"]["id"].GetString(), api_key)); } } - // get remaining information from flow - if (root.HasMember("name")) - name = root["name"].GetString(); - if (root.HasMember("description")) - description = root["description"].GetString(); - if (root.HasMember("class_name")) - class_name = root["class_name"].GetString(); - auto flow = std::make_shared( - name, description, class_name, components, params); + flow_id, name, description, class_name, external_version, components, + params); return flow; } @@ -112,4 +187,55 @@ std::shared_ptr OpenMLFlow::from_file() { SG_SNOTIMPLEMENTED; return std::shared_ptr(); -} \ No newline at end of file +} + +bool OpenMLFlow::exists_on_server() +{ + // check if flow with same name and version exists + auto reader = std::make_unique(""); + auto flow_exists_string = + reader->get("flow_exists", "json", m_name, m_external_version); + + auto& root = + check_response(flow_exists_string, "flow_exists"); + + return strcmp(root["exists"].GetString(), "true") == 0; +} + +std::vector> +OpenMLFlow::obtain_parameter_values(const std::shared_ptr& model) +{ + std::vector> result; + auto obj_param = model->get_params(); + auto visitor = std::make_unique(); + + result.reserve(m_parameters.size()); + + for (const auto& param : m_parameters) + { + Any any_val = obj_param.at(param.first)->get_value(); + 
any_val.visit(visitor.get()); + // nested objects are handled below + if (!visitor->get_string_value().empty()) + { + // result.emplace_back to call OpenMLParameterValues constructor + // doesn't work here, so create a temporary value with make_shared + // and then push_back + auto val = std::make_shared( + param.first, m_flow_id, visitor->get_string_value()); + result.push_back(val); + } + } + + for (const auto& components : m_components) + { + // TODO: remove std::shared_ptr when smart pointers available + auto obj = std::shared_ptr(model->get(components.first)); + auto val = std::make_shared( + components.first, m_flow_id, + components.second->obtain_parameter_values(obj)); + result.push_back(val); + } + + return result; +} diff --git a/src/shogun/io/openml/OpenMLFlow.h b/src/shogun/io/openml/OpenMLFlow.h index e8424e58343..7ff9f4ff5e0 100644 --- a/src/shogun/io/openml/OpenMLFlow.h +++ b/src/shogun/io/openml/OpenMLFlow.h @@ -7,14 +7,53 @@ #ifndef SHOGUN_OPENMLFLOW_H #define SHOGUN_OPENMLFLOW_H -#include +#include +#include #include #include - namespace shogun { + /** + * OpenML representation of a parameter + */ + struct OpenMLParameterValues + { + /** name of parameter */ + std::string name; + /** id of flow parameter belongs to */ + std::string component; + /** value of parameter or a nested object */ + shogun::variant< + std::string, std::vector>> + value; + + OpenMLParameterValues( + const std::string& name_, const std::string& component_, + shogun::variant< + std::string, + std::vector>> + value_) + : name(name_), component(component_), value(std::move(value_)) + { + } + }; + + /** + * OpenML description of a flow parameter + */ + struct OpenMLFlowParameter + { + /** name of parameter */ + std::string name; + /** primitive type of parameter */ + std::string data_type; + /** the default value, i.e. the value published to OpenML */ + std::string default_value; + /** the parameter description */ + std::string description; + }; /** * Handles OpenML flows. 
A flow contains the information * required to instantiate a model. @@ -28,8 +67,7 @@ namespace shogun std::unordered_map>; /** alias for parameter type, map of maps with information specific to a * parameter */ - using parameters_type = std::unordered_map< - std::string, std::unordered_map>; + using parameters_type = std::unordered_map; /** * The OpenMLFlow constructor. This constructor is rarely used by the @@ -45,10 +83,12 @@ namespace shogun * for each parameter name */ OpenMLFlow( - const std::string& name, const std::string& description, - const std::string& model, components_type components, + const std::string& flow_id, const std::string& name, + const std::string& description, const std::string& model, + const std::string& external_version, components_type components, parameters_type parameters) - : m_name(name), m_description(description), m_class_name(model), + : m_flow_id(flow_id), m_name(name), m_description(description), + m_class_name(model), m_external_version(external_version), m_parameters(std::move(parameters)), m_components(std::move(components)) { @@ -85,6 +125,11 @@ namespace shogun */ void dump() const; + bool exists_on_server(); + + std::vector> + obtain_parameter_values(const std::shared_ptr& model); + /** * Gets a subflow, i.e. 
a kernel in a machine * @param name the name of the subflow, not the flow ID @@ -102,40 +147,60 @@ namespace shogun } #ifndef SWIG - SG_FORCED_INLINE parameters_type - - get_parameters() const noexcept + SG_FORCED_INLINE parameters_type get_parameters() const noexcept { return m_parameters; } - SG_FORCED_INLINE components_type - - get_components() const noexcept + SG_FORCED_INLINE components_type get_components() const noexcept { return m_components; } - SG_FORCED_INLINE std::string - - get_class_name() const noexcept + SG_FORCED_INLINE std::string get_class_name() const noexcept { return m_class_name; } + SG_FORCED_INLINE std::string get_version() const noexcept + { + return m_external_version; + } + + SG_FORCED_INLINE std::string get_name() const noexcept + { + return m_name; + } + + SG_FORCED_INLINE std::string get_flow_id() const noexcept + { + return m_flow_id; + } + + SG_FORCED_INLINE void set_model(std::shared_ptr model) noexcept + { + m_model = model; + } + #endif // SWIG private: + /** flow unique id */ + std::string m_flow_id; /** name field of the flow */ std::string m_name; /** description field of the flow */ std::string m_description; /** the class_name field of the flow */ std::string m_class_name; + /** the external version, e.g. "shogun=7.0.0" */ + std::string m_external_version; /** the parameter field of the flow (optional) */ parameters_type m_parameters; /** the components fields of the flow (optional) */ components_type m_components; + /** the associated model (optional) */ + std::shared_ptr m_model; }; } // namespace shogun diff --git a/src/shogun/io/openml/OpenMLReader.cpp b/src/shogun/io/openml/OpenMLReader.cpp deleted file mode 100644 index 3751e686d19..00000000000 --- a/src/shogun/io/openml/OpenMLReader.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). 
- * - * Authors: Gil Hoben - */ - -#include - -#ifdef HAVE_CURL -#include -#endif // HAVE_CURL - -using namespace shogun; - -/** - * The writer callback function used to write the packets to a C++ string. - * @param data the data received in CURL request - * @param size always 1 - * @param nmemb the size of data - * @param buffer_in the buffer to write to - * @return the size of buffer that was written - */ -size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in) -{ - // check that the buffer string points to something - if (buffer_in != nullptr) - { - // Append the data to the buffer - buffer_in->append(data, size * nmemb); - - return size * nmemb; - } - return 0; -} - -/* OpenML server format */ -const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml"; -const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json"; -const char* OpenMLReader::download_server = ""; -const char* OpenMLReader::splits_server = "https://www.openml.org/api_splits"; - -/* DATA API */ -const char* OpenMLReader::dataset_description = "/data/{}"; -const char* OpenMLReader::list_data_qualities = "/data/qualities/list"; -const char* OpenMLReader::data_features = "/data/features/{}"; -const char* OpenMLReader::data_qualities = "/data/qualities/{}"; -const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}"; -const char* OpenMLReader::list_dataset_filter = "/data/list/{}"; -/* FLOW API */ -const char* OpenMLReader::flow_file = "/flow/{}"; -/* TASK API */ -const char* OpenMLReader::task_file = "/task/{}"; -/* SPLIT API */ -const char* OpenMLReader::get_split = ""; - -const std::unordered_map - OpenMLReader::m_format_options = {{"xml", xml_server}, - {"json", json_server}, - {"split", splits_server}, - {"download", download_server}}; -const std::unordered_map - OpenMLReader::m_request_options = { - {"dataset_description", dataset_description}, - {"list_data_qualities", list_data_qualities}, - {"data_features", data_features}, - 
{"data_qualities", data_qualities}, - {"list_dataset_qualities", list_dataset_qualities}, - {"list_dataset_filter", list_dataset_filter}, - {"flow_file", flow_file}, - {"task_file", task_file}}; - -void OpenMLReader::openml_curl_request_helper(const std::string& url) -{ -#ifdef HAVE_CURL - CURL* curl_handle = nullptr; - - curl_handle = curl_easy_init(); - - if (!curl_handle) - { - SG_SERROR("Failed to initialise curl handle.\n") - return; - } - - curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl_handle, CURLOPT_HTTPGET, 1); - curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer); - curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &m_curl_response_buffer); - - CURLcode res = curl_easy_perform(curl_handle); - - if (res != CURLE_OK) - SG_SERROR("Connection error: %s.\n", curl_easy_strerror(res)) - - curl_easy_cleanup(curl_handle); -#endif // HAVE_CURL -} diff --git a/src/shogun/io/openml/OpenMLRun.cpp b/src/shogun/io/openml/OpenMLRun.cpp index 6c6420f54b3..465c5045c21 100644 --- a/src/shogun/io/openml/OpenMLRun.cpp +++ b/src/shogun/io/openml/OpenMLRun.cpp @@ -5,12 +5,16 @@ */ #include +#include #include #include +#include #include #include using namespace shogun; +using namespace shogun::openml_detail; +using namespace rapidjson; std::shared_ptr OpenMLRun::run_model_on_task( std::shared_ptr model, std::shared_ptr task) @@ -20,149 +24,87 @@ std::shared_ptr OpenMLRun::run_model_on_task( } std::shared_ptr OpenMLRun::run_flow_on_task( - std::shared_ptr flow, std::shared_ptr task) + std::shared_ptr flow, std::shared_ptr task, + bool avoid_duplicate_runs) { + if (avoid_duplicate_runs && flow->exists_on_server()) + { + auto flow_from_server = + OpenMLFlow::download_flow(flow->get_flow_id(), ""); + } + auto data = task->get_dataset(); - std::shared_ptr features = nullptr; - std::shared_ptr labels = nullptr; - auto model = ShogunOpenML::flow_to_model(std::move(flow), true); + auto model = ShogunOpenML::flow_to_model(flow, true); + 
flow->set_model(model); - labels = data->get_labels(); - features = data->get_features(data->get_default_target_attribute()); + auto labels = data->get_labels(); + auto features = data->get_features(data->get_default_target_attribute()); + + auto machine = std::dynamic_pointer_cast(model); + if (!machine) + { + SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") + } - auto storage = std::make_shared(); + auto* xval_storage = new CrossValidationStorage(); if (task->get_split()->contains_splits()) { - auto machine = std::dynamic_pointer_cast(model); - if (!machine) - { - SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") - } auto train_idx = task->get_train_indices(); auto test_idx = task->get_test_indices(); - auto xval_storage = std::make_shared(); - xval_storage->set_num_folds(task->get_num_fold()); xval_storage->set_num_runs(task->get_num_repeats()); + xval_storage->set_num_folds(task->get_num_fold()); + machine->set_store_model_features(true); - // copied/adapted from crossvalidation for (auto repeat_idx : range(task->get_num_repeats())) { for (auto fold_idx : range(task->get_num_fold())) { - auto* fold = new CrossValidationFoldStorage(); - SG_REF(fold) - - auto cloned_machine = (CMachine*)machine->clone(); - - // TODO while these are not used through const interfaces, - // we unfortunately have to clone, even though these could be - // shared - auto features_clone = (CFeatures*)features->clone(); - auto labels_clone = (CLabels*)labels->clone(); - // auto evaluation_criterion = - // (CEvaluation*)m_evaluation_criterion->clone(); - - /* evtl. 
update xvalidation output class */ - fold->set_run_index(repeat_idx); - fold->set_fold_index(fold_idx); - - auto train_fold_idx = SGVector( + SGVector train_i_idx( train_idx[repeat_idx][fold_idx].data(), - train_idx[repeat_idx][fold_idx].size(), false); - - features_clone->add_subset(train_fold_idx); - - /* set label subset for training */ - labels_clone->add_subset(train_fold_idx); - - SG_SDEBUG( - "train set repeat %d fold %d: %s\n", repeat_idx, fold_idx, - train_fold_idx.to_string().c_str()) - - /* train machine on training features and remove subset */ - SG_SDEBUG("starting training\n") - cloned_machine->set_labels(labels_clone); - cloned_machine->train(features_clone); - SG_SDEBUG("finished training\n") - - /* evtl. update xvalidation output class */ - fold->set_train_indices(train_fold_idx); - auto fold_machine = (CMachine*)cloned_machine->clone(); - fold->set_trained_machine(fold_machine); - SG_UNREF(fold_machine) - - features_clone->remove_subset(); - labels_clone->remove_subset(); - - /* set feature subset for testing (subset method that stores - * pointer) */ - auto test_fold_idx = SGVector( - test_idx[repeat_idx][fold_idx].data(), - test_idx[repeat_idx][fold_idx].size(), false); - features_clone->add_subset(test_fold_idx); - - /* set label subset for testing */ - labels_clone->add_subset(test_fold_idx); - - SG_SDEBUG( - "test set repeat %d fold %d: %s\n", repeat_idx, fold_idx, - test_fold_idx.to_string().c_str()) - - /* apply machine to test features and remove subset */ - SG_SDEBUG("starting evaluation\n") - SG_SDEBUG("%p\n", features_clone) - CLabels* result_labels = cloned_machine->apply(features_clone); - SG_SDEBUG("finished evaluation\n") - features_clone->remove_subset(); - SG_REF(result_labels); - - /* evaluate */ - // results[i] = - // evaluation_criterion->evaluate(result_labels, - //labels); SG_DEBUG("result on fold %d is %f\n", i, results[i]) - - /* evtl. 
update xvalidation output class */ - // fold->set_test_indices(test_fold_idx); - // fold->set_test_result(result_labels); - // auto* true_labels = (CLabels*)labels->clone(); - // fold->set_test_true_result(true_labels); - // SG_UNREF(true_labels) - // fold->post_update_results(); - // fold->set_evaluation_result(results[i]); - - storage->append_fold_result(fold); - // - // /* clean up, remove subsets */ - // labels->remove_subset(); - SG_UNREF(cloned_machine); - SG_UNREF(features_clone); - SG_UNREF(labels_clone); - // SG_UNREF(evaluation_criterion); - // SG_UNREF(result_labels); - SG_UNREF(fold) + train_idx[repeat_idx][fold_idx].size()); + SGVector test_i_idx( + train_idx[repeat_idx][fold_idx].data(), + train_idx[repeat_idx][fold_idx].size()); + xval_storage->append_fold_result( + ShogunOpenML::run_model_on_fold( + machine, task, features, labels, train_i_idx, + test_i_idx, repeat_idx, fold_idx) + .release()); } } } else { - // ensures delete is called by shared ptr destructor - SG_REF(labels.get()) - SG_REF(features.get()) - if (auto machine = std::dynamic_pointer_cast(model)) - { - auto result = ShogunOpenML::run_model_on_fold( - machine, task, features, 0, 0, labels, - nullptr); - SG_SDEBUG(result->to_string().c_str()); - } - else - SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") + xval_storage->set_num_runs(0); + xval_storage->set_num_folds(0); + xval_storage->append_fold_result( + ShogunOpenML::run_model_on_fold(machine, task, features, labels) + .release()); } - return std::shared_ptr(); + SG_SDEBUG("End of openml run: %s\n", xval_storage->to_string().c_str()); + + return std::make_shared( + nullptr, // uploader + nullptr, // uploader_name + nullptr, // setup_id + nullptr, // setup_string + nullptr, // parameter_settings + std::vector{}, // evaluations + std::vector{}, // fold_evaluations + std::vector{}, // sample_evaluations + nullptr, // data_content + std::vector{}, // output_files + task, // task + flow, // flow + nullptr, // run_id + 
model, // model + std::vector{}, // tags + nullptr // predictions_url + ); } std::shared_ptr diff --git a/src/shogun/io/openml/OpenMLRun.h b/src/shogun/io/openml/OpenMLRun.h index 9046d09bea9..3619bed62d1 100644 --- a/src/shogun/io/openml/OpenMLRun.h +++ b/src/shogun/io/openml/OpenMLRun.h @@ -46,7 +46,9 @@ namespace shogun { from_filesystem(const std::string& directory); static std::shared_ptr run_flow_on_task( - std::shared_ptr flow, std::shared_ptr task); + std::shared_ptr flow, + std::shared_ptr task, + bool avoid_duplicate_runs=true); static std::shared_ptr run_model_on_task( std::shared_ptr model, std::shared_ptr task); diff --git a/src/shogun/io/openml/OpenMLSplit.cpp b/src/shogun/io/openml/OpenMLSplit.cpp index 19569137e0d..42d14520168 100644 --- a/src/shogun/io/openml/OpenMLSplit.cpp +++ b/src/shogun/io/openml/OpenMLSplit.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include #include using namespace shogun; @@ -17,7 +17,7 @@ using namespace shogun; std::shared_ptr OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) { - auto reader = OpenMLReader(api_key); + auto reader = OpenMLFile(api_key); auto return_string = reader.get(split_url); if (return_string == "Task not providing datasplits.") diff --git a/src/shogun/io/openml/OpenMLTask.cpp b/src/shogun/io/openml/OpenMLTask.cpp index c4a4c1fb00c..f41bd3d0cc7 100644 --- a/src/shogun/io/openml/OpenMLTask.cpp +++ b/src/shogun/io/openml/OpenMLTask.cpp @@ -4,7 +4,7 @@ * Authors: Gil Hoben */ -#include +#include #include #include @@ -12,23 +12,34 @@ using namespace shogun; using namespace shogun::openml_detail; using namespace rapidjson; +struct DatasetStruct +{ + std::string dataset_id; + std::string target_feature; +}; + +struct SplitStruct +{ + std::string split_id; + std::string type; + std::string data_splits_url; + std::unordered_map split_parameters; +}; + std::shared_ptr OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) { - Document document; 
std::string task_name; std::string task_type_id; - std::shared_ptr openml_dataset = nullptr; - std::shared_ptr openml_split = nullptr; std::unordered_map evaluation_measures; - auto reader = OpenMLReader(api_key); - auto return_string = reader.get("task_file", "json", task_id); + DatasetStruct dataset_struct{}; + SplitStruct split_struct{}; - document.Parse(return_string.c_str()); - check_response(document, "task"); + auto reader = OpenMLFile(api_key); + auto return_string = reader.get("task_file", "json", task_id); - const Value& root = document["task"]; + auto& root = check_response(return_string, "task"); REQUIRE( task_id == root["task_id"].GetString(), @@ -42,7 +53,7 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) task_type_id = root["task_type_id"].GetString(); // expect two elements in input array: dataset and split - const Value& json_input = root["input"]; + auto& json_input = root["input"]; auto input_array = json_input.GetArray(); @@ -51,19 +62,21 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) if (strcmp(task_settings["name"].GetString(), "source_data") == 0) { auto dataset_info = task_settings["data_set"].GetObject(); - std::string dataset_id = dataset_info["data_set_id"].GetString(); - std::string target_feature = - dataset_info["target_feature"].GetString(); - openml_dataset = OpenMLData::get_dataset(dataset_id, api_key); + add_string_to_struct( + dataset_info, "data_set_id", dataset_struct.dataset_id); + add_string_to_struct( + dataset_info, "target_feature", dataset_struct.target_feature); } else if ( strcmp(task_settings["name"].GetString(), "estimation_procedure") == 0) { auto split_info = task_settings["estimation_procedure"].GetObject(); - std::string split_id = split_info["id"].GetString(); - std::string split_type = split_info["type"].GetString(); - std::string split_url = split_info["data_splits_url"].GetString(); + add_string_to_struct(split_info, "id", split_struct.split_id); + 
add_string_to_struct(split_info, "type", split_struct.type); + add_string_to_struct( + split_info, "data_splits_url", split_struct.data_splits_url); + std::unordered_map split_parameters; for (const auto& param : split_info["parameter"].GetArray()) { @@ -77,11 +90,12 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) "Unexpected number of parameters in parameter array " "of estimation_procedure.\n") } + split_struct.split_parameters = split_parameters; + REQUIRE( - split_type == "crossvalidation", + split_struct.type == "crossvalidation", "Currently only tasks with cross validation are enabled in " "shogun!\n") - openml_split = OpenMLSplit::get_split(split_url, api_key); } else if ( strcmp(task_settings["name"].GetString(), "evaluation_measures") == @@ -100,8 +114,17 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key) } } - if (openml_dataset == nullptr && openml_split == nullptr) - SG_SERROR("Error parsing task.\n") + REQUIRE( + !dataset_struct.dataset_id.empty(), + "The dataset ID is required to retrieve the dataset!\n") + auto openml_dataset = + OpenMLData::get_dataset(dataset_struct.dataset_id, api_key); + + REQUIRE( + !split_struct.data_splits_url.empty(), + "The split URL is required to retrieve the split information!\n") + auto openml_split = + OpenMLSplit::get_split(split_struct.data_splits_url, api_key); auto result = std::make_shared( task_id, task_name, task_type, task_type_id, evaluation_measures, @@ -133,7 +156,7 @@ OpenMLTask::get_test_indices() const std::vector>> OpenMLTask::get_indices(const std::array, 3>& idx) const { - // result = (n_repeats, n_folds, ?) where is the number of indices in a + // result = (n_repeats, n_folds, ?) where ? 
is the number of indices in a // given fold std::vector>> result( m_split->get_num_repeats(), diff --git a/src/shogun/io/openml/OpenMLTask.h b/src/shogun/io/openml/OpenMLTask.h index 9f5e97c1bef..c248ee7d109 100644 --- a/src/shogun/io/openml/OpenMLTask.h +++ b/src/shogun/io/openml/OpenMLTask.h @@ -62,7 +62,8 @@ namespace shogun return m_split; } - std::vector>> get_train_indices() const; + std::vector>> + get_train_indices() const; std::vector>> get_test_indices() const; @@ -76,14 +77,16 @@ namespace shogun return m_split->get_num_repeats(); } -#ifndef SWIG - SG_FORCED_INLINE TaskType + std::string get_task_name() const noexcept + { + return m_task_name; + } - get_task_type() const noexcept +#ifndef SWIG + SG_FORCED_INLINE TaskType get_task_type() const noexcept { return m_task_type; } - #endif // SWIG private: diff --git a/src/shogun/io/openml/ShogunOpenML.cpp b/src/shogun/io/openml/ShogunOpenML.cpp index aff02d90a62..ed73099f1c3 100644 --- a/src/shogun/io/openml/ShogunOpenML.cpp +++ b/src/shogun/io/openml/ShogunOpenML.cpp @@ -1,3 +1,5 @@ +#include + /* * This software is distributed under BSD 3-clause license (see LICENSE file). * @@ -22,12 +24,14 @@ class StringToShogun : public AnyVisitor { public: explicit StringToShogun(std::shared_ptr model) - : m_model(model), m_parameter(""), m_string_val(""){}; + : m_model(std::move(model)), m_parameter(""), + m_string_val(""){SG_SDEBUG("Debugging StringToShogun\n")}; StringToShogun( - std::shared_ptr model, const std::string& parameter, - const std::string& string_val) - : m_model(model), m_parameter(parameter), m_string_val(string_val){}; + std::shared_ptr model, const std::string& parameter, + const std::string& string_val) + : m_model(std::move(model)), m_parameter(parameter), + m_string_val(string_val){SG_SDEBUG("Debugging StringToShogun\n")}; void on(bool* v) final { @@ -53,11 +57,11 @@ class StringToShogun : public AnyVisitor // it's an option, i.e. 
internally represented // as an enum but in swig exposed as a string m_string_val.erase( - std::remove_if( - m_string_val.begin(), m_string_val.end(), - // remove quotes - [](const auto& val) { return val == '\"'; }), - m_string_val.end()); + std::remove_if( + m_string_val.begin(), m_string_val.end(), + // remove quotes + [](const auto& val) { return val == '\"'; }), + m_string_val.end()); m_model->put(m_parameter, m_string_val); } } @@ -93,7 +97,7 @@ class StringToShogun : public AnyVisitor void on(long double* v) { SG_SDEBUG( - "long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + "long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) if (!is_null()) { floatmax_t result = std::stold(m_string_val); @@ -103,38 +107,38 @@ class StringToShogun : public AnyVisitor void on(CSGObject** v) final { SG_SDEBUG( - "CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + "CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGVector* v) final { SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + "SGVector: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGVector* v) final { SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } void on(SGVector* v) final { SG_SDEBUG( - "SGVector: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) + "SGVector: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } void on(SGMatrix* mat) final { SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) + "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str()) } void on(SGMatrix* mat) final { SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), - m_string_val.c_str()) + "SGMatrix: %s=%s\n", m_parameter.c_str(), + m_string_val.c_str()) } void on(SGMatrix* mat) final{SG_SDEBUG( - "SGMatrix: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())} + "SGMatrix: %s=%s\n", 
m_parameter.c_str(), m_string_val.c_str())} /** * In OpenML "null" is an empty parameter value field. @@ -169,7 +173,7 @@ class StringToShogun : public AnyVisitor * @return the instantiated object using a factory */ std::shared_ptr instantiate_model_from_factory( - const std::string& factory_name, const std::string& algo_name) + const std::string& factory_name, const std::string& algo_name) { if (factory_name == "machine") return std::shared_ptr(machine(algo_name)); @@ -190,9 +194,9 @@ std::shared_ptr instantiate_model_from_factory( * @param parameter_name the name of nested_obj */ void cast_and_put( - const std::shared_ptr& obj, - const std::shared_ptr& nested_obj, - const std::string& parameter_name) + const std::shared_ptr& obj, + const std::shared_ptr& nested_obj, + const std::string& parameter_name) { if (auto casted_obj = std::dynamic_pointer_cast(nested_obj)) { @@ -218,7 +222,7 @@ void cast_and_put( } std::shared_ptr ShogunOpenML::flow_to_model( - std::shared_ptr flow, bool initialize_with_defaults) + std::shared_ptr flow, bool initialize_with_defaults) { auto params = flow->get_parameters(); auto components = flow->get_components(); @@ -237,7 +241,7 @@ std::shared_ptr ShogunOpenML::flow_to_model( { Any any_val = obj_param.at(param.first)->get_value(); std::string name = param.first; - std::string val_as_string = param.second.at("default_value"); + std::string val_as_string = param.second.default_value; visitor->set_parameter_name(name); visitor->set_string_value(val_as_string); any_val.visit(visitor.get()); @@ -247,7 +251,7 @@ std::shared_ptr ShogunOpenML::flow_to_model( for (const auto& component : components) { std::shared_ptr nested_obj = - flow_to_model(component.second, initialize_with_defaults); + flow_to_model(component.second, initialize_with_defaults); cast_and_put(obj, nested_obj, component.first); } @@ -280,61 +284,199 @@ ShogunOpenML::get_class_info(const std::string& class_name) class_components.emplace_back(std::string(begin, std::next(it))); 
} - if (class_components[0] == "shogun" && class_components.size() == 3) + if (class_components.empty()) + SG_SERROR( + "Error parsing flow class name \"%s\"!\n", class_name.c_str()); + + if (class_components.size() == 3 && class_components[0] == "shogun") result = std::make_pair(class_components[1], class_components[2]); - else if (class_components[0] == "shogun" && class_components.size() != 3) + else if (class_components.size() != 3 && class_components[0] == "shogun") SG_SERROR("Invalid class name format %s.\n", class_name.c_str()) else SG_SERROR( - "The provided flow is not meant for shogun deserialisation! The " - "required library is \"%s\".\n", - class_components[0].c_str()) + "The provided flow is not meant for shogun deserialisation! The " + "required library is \"%s\".\n", + class_components[0].c_str()) return result; } -std::shared_ptr ShogunOpenML::run_model_on_fold( - const std::shared_ptr& model, - const std::shared_ptr& task, - const std::shared_ptr& X_train, index_t repeat_number, - index_t fold_number, const std::shared_ptr& y_train, - const std::shared_ptr& X_test) +std::unique_ptr ShogunOpenML::run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + const std::shared_ptr& labels, const SGVector& train_idx, + const SGVector& test_idx, index_t repeat_idx, index_t fold_idx) { auto task_type = task->get_task_type(); - auto model_clone = std::shared_ptr(model->clone()); switch (task_type) { - case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: - case OpenMLTask::TaskType::SUPERVISED_REGRESSION: - { - if (auto machine = std::dynamic_pointer_cast(model_clone)) - { - // TODO: refactor! 
more useless clones until smart pointers are merged - machine->put("labels", y_train->clone()->as()); - auto tmp = X_train.get(); - machine->train(tmp); - if (X_test) - return std::shared_ptr(machine->apply(X_test.get())); - else - return std::shared_ptr(machine->apply(X_train.get())); - } - else - SG_SERROR("The provided model is not a trainable machine!\n") - } - break; - case OpenMLTask::TaskType::LEARNING_CURVE: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::CLUSTERING: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: - SG_SNOTIMPLEMENTED - case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: - SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + { + // copied/adapted from crossvalidation + auto fold = std::make_unique(); + auto* cloned_machine = machine->clone()->as(); + + // TODO while these are not used through const interfaces, + // we unfortunately have to clone, even though these could be + // shared + auto* features_clone = features->clone()->as(); + auto* labels_clone = labels->clone()->as(); + // auto* evaluation_criterion = + // (CEvaluation*)m_evaluation_criterion->clone(); + + /* evtl. update xvalidation output class */ + fold->set_run_index(repeat_idx); + fold->set_fold_index(fold_idx); + + /* set feature and label subset for training */ + features_clone->add_subset(train_idx); + labels_clone->add_subset(train_idx); + + SG_SDEBUG( + "train set repeat %d fold %d: %s\n", repeat_idx, fold_idx, + train_idx.to_string().c_str()) + + /* train machine on training features and remove subset */ + SG_SDEBUG("starting training\n") + cloned_machine->set_labels(labels_clone); + cloned_machine->train(features_clone); + SG_SDEBUG("finished training\n") + + /* evtl. 
update xvalidation output class */ + fold->set_train_indices(train_idx); + auto* fold_machine = cloned_machine->clone()->as(); + fold->set_trained_machine(fold_machine); + SG_UNREF(fold_machine) + + features_clone->remove_subset(); + labels_clone->remove_subset(); + + /* set features and label subset for testing */ + features_clone->add_subset(test_idx); + labels_clone->add_subset(test_idx); + + SG_SDEBUG( + "test set repeat %d fold %d: %s\n", repeat_idx, fold_idx, + test_idx.to_string().c_str()) + + /* apply machine to test features and remove subset */ + SG_SDEBUG("starting evaluation\n") + SG_SDEBUG("%p\n", features_clone) + CLabels* result_labels = cloned_machine->apply(features_clone); + SG_SDEBUG("finished evaluation\n") + features_clone->remove_subset(); + SG_REF(result_labels); + + /* evaluate */ + // results[i] = evaluation_criterion->evaluate(result_labels, labels); + // SG_DEBUG("result on fold %d is %f\n", i, results[i]) + + /* evtl. update xvalidation output class */ + fold->set_test_indices(test_idx); + fold->set_test_result(result_labels); + auto* true_labels = (CLabels*)labels->clone(); + fold->set_test_true_result(true_labels); + SG_UNREF(true_labels) + fold->post_update_results(); + // fold->set_evaluation_result(results[i]); + + /* clean up, remove subsets */ + labels->remove_subset(); + SG_UNREF(cloned_machine); + SG_UNREF(features_clone); + SG_UNREF(labels_clone); + // SG_UNREF(evaluation_criterion); + SG_UNREF(result_labels); + return fold; + } + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + return nullptr; +} + +std::unique_ptr 
ShogunOpenML::run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + const std::shared_ptr& labels) +{ + auto task_type = task->get_task_type(); + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + { + auto fold = std::make_unique(); + auto* cloned_machine = machine->clone()->as(); + auto* features_clone = features->clone()->as(); + auto* labels_clone = labels->clone()->as(); + fold->set_run_index(0); + fold->set_fold_index(0); + + /* train machine on training features */ + SG_SDEBUG("starting training\n") + cloned_machine->set_labels(labels_clone); + cloned_machine->train(features_clone); + SG_SDEBUG("finished training\n") + + auto* fold_machine = cloned_machine->clone()->as(); + fold->set_trained_machine(fold_machine); + SG_UNREF(fold_machine) + + /* apply machine to test features */ + SG_SDEBUG("starting evaluation\n") + SG_SDEBUG("%p\n", features_clone) + CLabels* result_labels = cloned_machine->apply(features_clone); + SG_SDEBUG("finished evaluation\n") + + /* evaluate */ + // results[i] = evaluation_criterion->evaluate(result_labels, labels); + // SG_DEBUG("result on fold %d is %f\n", i, results[i]) + + /* evtl. 
update xvalidation output class */ + fold->set_test_result(result_labels); + auto* true_labels = (CLabels*)labels->clone(); + fold->set_test_true_result(true_labels); + SG_UNREF(true_labels) + fold->post_update_results(); + // fold->set_evaluation_result(results[i]); + + // cleanup + SG_UNREF(cloned_machine); + SG_UNREF(features_clone); + SG_UNREF(labels_clone); + // SG_UNREF(evaluation_criterion); + SG_UNREF(result_labels); + return fold; + } + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED } return nullptr; } \ No newline at end of file diff --git a/src/shogun/io/openml/ShogunOpenML.h b/src/shogun/io/openml/ShogunOpenML.h index ff5257f19b8..88f0e19a842 100644 --- a/src/shogun/io/openml/ShogunOpenML.h +++ b/src/shogun/io/openml/ShogunOpenML.h @@ -8,6 +8,7 @@ #define SHOGUN_SHOGUNOPENML_H #include +#include #include #include @@ -44,12 +45,21 @@ namespace shogun model_to_flow(const std::shared_ptr& model); protected: - static std::shared_ptr run_model_on_fold( - const std::shared_ptr& model, - const std::shared_ptr& task, - const std::shared_ptr& X_train, index_t repeat_number, - index_t fold_number, const std::shared_ptr& y_train, - const std::shared_ptr& X_test); + static std::unique_ptr run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + const std::shared_ptr& labels, + const SGVector& train_idx, + const SGVector& test_id, + index_t repeat_number, + index_t fold_number); + + static std::unique_ptr run_model_on_fold( + const std::shared_ptr& machine, + const std::shared_ptr& task, + const std::shared_ptr& features, + 
const std::shared_ptr& labels); private: /** diff --git a/src/shogun/io/openml/utils.h b/src/shogun/io/openml/utils.h index 122ce170098..b9f0d0ec1bf 100644 --- a/src/shogun/io/openml/utils.h +++ b/src/shogun/io/openml/utils.h @@ -15,65 +15,97 @@ namespace shogun { namespace openml_detail { + enum class BACKEND_FORMAT + { + JSON = 0, + XML = 1, + }; + /** * Checks the returned response from OpenML in JSON format * @param doc the parsed OpenML JSON format response */ - static void - check_response(const rapidjson::Document& doc, const std::string& type) + template < + BACKEND_FORMAT FormatT, + typename std::enable_if_t* = + nullptr> + const rapidjson::Value& + check_response(const std::string& val, const std::string& root_name) { + rapidjson::Document doc; + doc.Parse(val.c_str()); + if (SG_UNLIKELY(doc.HasMember("error"))) { const rapidjson::Value& root = doc["error"]; SG_SERROR( "Server error %s: %s\n", root["code"].GetString(), root["message"].GetString()) - return; } REQUIRE( - doc.HasMember(type.c_str()), - "Unexpected format of OpenML %s.\n", type.c_str()); + doc.HasMember(root_name.c_str()), + "Unexpected format of OpenML %s.\n", root_name.c_str()); + + return doc[root_name.c_str()]; } /** - * Helper function to add JSON objects as string in map - * @param v a RapidJSON GenericValue, i.e. 
string - * @param param_dict the map to write to - * @param name the name of the key + * Checks the returned response from OpenML in XML format + * @param doc the parsed OpenML XML format response */ - static SG_FORCED_INLINE void emplace_string_to_map( - const rapidjson::GenericValue>& v, - std::unordered_map& param_dict, - const std::string& name, bool required = false) + template < + BACKEND_FORMAT FormatT, + typename std::enable_if_t* = + nullptr> + void check_response(const std::string& val, const std::string& type) { - if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else if (required) - SG_SERROR( - "The field \"%s\" is expected to be a string!\n", - name.c_str()) - else - param_dict.emplace(name, ""); + SG_SNOTIMPLEMENTED } - /** - * Helper function to add JSON objects as string in map - * @param v a RapidJSON GenericObject, i.e. array - * @param param_dict the map to write to - * @param name the name of the key - */ - static SG_FORCED_INLINE void emplace_string_to_map( + template + static SG_FORCED_INLINE void add_string_to_struct( const rapidjson::GenericObject< true, rapidjson::GenericValue>>& v, - std::unordered_map& param_dict, - const std::string& name) + const std::string& name, T& custom_struct) { if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) - param_dict.emplace(name, v[name.c_str()].GetString()); - else - param_dict.emplace(name, ""); + custom_struct = v[name.c_str()].GetString(); } + template + static SG_FORCED_INLINE void add_string_to_struct( + const rapidjson::GenericValue>& v, + const std::string& name, T& custom_struct) + { + if (v[name.c_str()].GetType() == rapidjson::Type::kStringType) + custom_struct = v[name.c_str()].GetString(); + } + + template + SG_FORCED_INLINE T must_return( + const std::string& name, + const rapidjson::GenericValue>& v) + { + SG_SNOTIMPLEMENTED + } + + template <> + SG_FORCED_INLINE std::string must_return( + const 
std::string& name, + const rapidjson::GenericValue>& v) + { + if (v.HasMember(name.c_str()) && v[name.c_str()].IsString()) + return v[name.c_str()].GetString(); + if (v.HasMember(name.c_str()) && !v[name.c_str()].IsString()) + SG_SERROR( + "Found member \"%s\" but it is not a string", name.c_str()) + if (!v.HasMember(name.c_str())) + SG_SERROR( + "\"%s\" is not a member of the given object", name.c_str()) + return nullptr; + } + + template SG_FORCED_INLINE T return_if_possible( const std::string& name, @@ -96,8 +128,6 @@ namespace shogun "Found member \"%s\" but it is not a string", name.c_str()) if (!v.HasMember(name.c_str())) return ""; - SG_SERROR( - "\"%s\" is not a member of the given object", name.c_str()) return nullptr; } From c2cf37b027c37e3c629d43f568f4d5ee88f7fd1c Mon Sep 17 00:00:00 2001 From: gf712 Date: Fri, 24 May 2019 14:32:13 +0100 Subject: [PATCH 31/32] fixes for new arff api --- src/shogun/io/openml/OpenMLData.cpp | 66 ++++++++++++++++------------ src/shogun/io/openml/OpenMLData.h | 17 +++---- src/shogun/io/openml/OpenMLRun.cpp | 16 +++---- src/shogun/io/openml/OpenMLSplit.cpp | 21 +++------ 4 files changed, 58 insertions(+), 62 deletions(-) diff --git a/src/shogun/io/openml/OpenMLData.cpp b/src/shogun/io/openml/OpenMLData.cpp index 728cb6b1484..5f88fcbfb24 100644 --- a/src/shogun/io/openml/OpenMLData.cpp +++ b/src/shogun/io/openml/OpenMLData.cpp @@ -23,7 +23,8 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) auto reader = OpenMLFile(api_key); auto return_string = reader.get("dataset_description", "json", id); - auto& dataset_description = check_response(return_string, "data_set_description"); + auto& dataset_description = check_response( + return_string, "data_set_description"); auto name = return_if_possible( "name", dataset_description.GetObject()); @@ -77,7 +78,8 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) param_vector; return_string = reader.get("data_features", "json", id); - 
auto& dataset_features = check_response(return_string, "data_features"); + auto& dataset_features = + check_response(return_string, "data_features"); for (const auto& param : dataset_features["feature"].GetArray()) { @@ -100,7 +102,8 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) std::vector> qualities_vector; return_string = reader.get("data_qualities", "json", id); - auto& data_qualities = check_response(return_string, "data_qualities"); + auto& data_qualities = + check_response(return_string, "data_qualities"); for (const auto& param : data_qualities["quality"].GetArray()) { @@ -129,14 +132,16 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key) std::shared_ptr OpenMLData::get_features() noexcept { - if (!m_cached_features) - get_data(); - return m_cached_features; + // if (!m_cached_features) + // get_data(); + // return m_cached_features; + SG_SNOTIMPLEMENTED + return nullptr; } std::shared_ptr OpenMLData::get_features(const std::string& label) { - if (!m_cached_features) + if (m_cached_features.empty()) get_data(); auto find_label = std::find(m_feature_names.begin(), m_feature_names.end(), label); @@ -147,39 +152,42 @@ std::shared_ptr OpenMLData::get_features(const std::string& label) feat_type_copy.erase(feat_type_copy.begin() + col_idx); for (const auto type : feat_type_copy) { - if (type == ARFFDeserializer::Attribute::STRING) + if (type == Attribute::STRING) SG_SNOTIMPLEMENTED } - // auto result = std::make_shared(); + std::shared_ptr> result; bool first = true; + size_t n_examples = 0; for (int i = 0; i < m_feature_types.size(); ++i) { if (i != col_idx && first) { - result.reset(m_cached_features->get_feature_obj(i) - ->as>()); + result.reset(m_cached_features[0]->as>()); + n_examples = result->get_num_vectors(); first = false; } if (i != col_idx) - result.reset( - result - ->create_merged_copy(m_cached_features->get_feature_obj(i)) - ->as>()); + { + REQUIRE( + n_examples == 
m_cached_features[i]->get_num_vectors(), + "Expected all features to have the same number of examples!\n") + result.reset(result->create_merged_copy(m_cached_features[i].get()) + ->as>()); + } } + REQUIRE(n_examples != 0, "No features extracted!\n") + // need to copy data as result is only in the stack and the data // will be gone at the end of the function - auto* copy_feat = SG_MALLOC( - float64_t, - m_feature_types.size() * m_cached_features->get_num_vectors()); + auto* copy_feat = SG_MALLOC(float64_t, m_feature_types.size() * n_examples); memcpy( copy_feat, result->get_feature_matrix().data(), - m_feature_types.size() * m_cached_features->get_num_vectors()); + m_feature_types.size() * m_cached_features.size()); result = std::make_shared>( - copy_feat, m_feature_types.size(), - m_cached_features->get_num_vectors()); + copy_feat, m_feature_types.size(), n_examples); return result; } @@ -197,7 +205,7 @@ std::shared_ptr OpenMLData::get_labels(const std::string& label_name) if (m_cached_labels && label_name == m_cached_label_name) return m_cached_labels; - if (!m_cached_features) + if (m_cached_features.empty()) get_data(); auto find_label = @@ -207,16 +215,16 @@ std::shared_ptr OpenMLData::get_labels(const std::string& label_name) "Requested label \"%s\" not in the dataset!\n", label_name.c_str()) auto col_idx = std::distance(m_feature_names.begin(), find_label); - auto target_label_as_feat = - std::shared_ptr(m_cached_features->get_feature_obj(col_idx)); + std::shared_ptr target_label_as_feat = + m_cached_features[col_idx]; switch (m_feature_types[col_idx]) { // real features - case ARFFDeserializer::Attribute::REAL: - case ARFFDeserializer::Attribute::NUMERIC: - case ARFFDeserializer::Attribute::INTEGER: - case ARFFDeserializer::Attribute::DATE: + case Attribute::REAL: + case Attribute::NUMERIC: + case Attribute::INTEGER: + case Attribute::DATE: { auto casted_feat = std::dynamic_pointer_cast>( target_label_as_feat); @@ -228,7 +236,7 @@ std::shared_ptr 
OpenMLData::get_labels(const std::string& label_name) } break; // nominal features - case ARFFDeserializer::Attribute::NOMINAL: + case Attribute::NOMINAL: { auto casted_feat = std::dynamic_pointer_cast>( target_label_as_feat); diff --git a/src/shogun/io/openml/OpenMLData.h b/src/shogun/io/openml/OpenMLData.h index 25bf54f3084..022eb06ad8d 100644 --- a/src/shogun/io/openml/OpenMLData.h +++ b/src/shogun/io/openml/OpenMLData.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include namespace shogun { @@ -90,7 +90,6 @@ namespace shogun */ std::shared_ptr get_labels(); - /** * Returns the dataset labels given the label_name * @return the labels @@ -101,16 +100,14 @@ namespace shogun * Returns the type of all attributes/features in the ARFF file * @return */ - SG_FORCED_INLINE std::vector - - get_feature_types() const noexcept + SG_FORCED_INLINE std::vector get_feature_types() const + noexcept { return m_feature_types; } - SG_FORCED_INLINE std::string - - get_default_target_attribute() const noexcept + SG_FORCED_INLINE std::string get_default_target_attribute() const + noexcept { return m_default_target_attribute; } @@ -153,9 +150,9 @@ namespace shogun m_param_qualities; std::string m_api_key; - std::shared_ptr m_cached_features; + std::vector> m_cached_features; std::vector m_feature_names; - std::vector m_feature_types; + std::vector m_feature_types; std::shared_ptr m_cached_labels; std::string m_cached_label_name; }; diff --git a/src/shogun/io/openml/OpenMLRun.cpp b/src/shogun/io/openml/OpenMLRun.cpp index 465c5045c21..da28c07994d 100644 --- a/src/shogun/io/openml/OpenMLRun.cpp +++ b/src/shogun/io/openml/OpenMLRun.cpp @@ -88,22 +88,22 @@ std::shared_ptr OpenMLRun::run_flow_on_task( SG_SDEBUG("End of openml run: %s\n", xval_storage->to_string().c_str()); return std::make_shared( - nullptr, // uploader - nullptr, // uploader_name - nullptr, // setup_id - nullptr, // setup_string - nullptr, // parameter_settings + std::string{}, // uploader + std::string{}, // 
uploader_name + std::string{}, // setup_id + std::string{}, // setup_string + std::string{}, // parameter_settings std::vector{}, // evaluations std::vector{}, // fold_evaluations std::vector{}, // sample_evaluations - nullptr, // data_content + std::string{}, // data_content std::vector{}, // output_files task, // task flow, // flow - nullptr, // run_id + std::string{}, // run_id model, // model std::vector{}, // tags - nullptr // predictions_url + std::string{} // predictions_url ); } diff --git a/src/shogun/io/openml/OpenMLSplit.cpp b/src/shogun/io/openml/OpenMLSplit.cpp index 42d14520168..04d6206ed5f 100644 --- a/src/shogun/io/openml/OpenMLSplit.cpp +++ b/src/shogun/io/openml/OpenMLSplit.cpp @@ -29,27 +29,18 @@ OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key) arff_parser.read(); auto arff_features = arff_parser.get_features(); REQUIRE( - arff_features->get_num_feature_obj() == 4, + arff_features.size() == 4, "Expected a ARFF file with 4 attributes: type, rowid, repeat and " "fold.\n") - auto train_test_feat = - std::shared_ptr(arff_features->get_feature_obj(0)); - auto rowid_feat = - std::shared_ptr(arff_features->get_feature_obj(1)); - auto repeat_feat = - std::shared_ptr(arff_features->get_feature_obj(2)); - auto fold_feat = - std::shared_ptr(arff_features->get_feature_obj(3)); - - auto type_vector = nominal_feature_to_vector(train_test_feat); - auto rowid_vector = dense_feature_to_vector(rowid_feat); - auto repeat_vector = dense_feature_to_vector(repeat_feat); - auto fold_vector = dense_feature_to_vector(fold_feat); + auto type_vector = nominal_feature_to_vector(arff_features[0]); + auto rowid_vector = dense_feature_to_vector(arff_features[1]); + auto repeat_vector = dense_feature_to_vector(arff_features[2]); + auto fold_vector = dense_feature_to_vector(arff_features[3]); std::array, 3> train_idx, test_idx; - for (int i = 0; i < train_test_feat->get_num_vectors(); ++i) + for (int i = 0; i < arff_features[0]->get_num_vectors(); 
++i) { if (type_vector[i] == LabelType::TRAIN) { From 6616ecf8bf5a0ec33e91e37b7cea0433d4ef5177 Mon Sep 17 00:00:00 2001 From: gf712 Date: Wed, 29 May 2019 10:56:59 +0100 Subject: [PATCH 32/32] added metric evaluation --- src/shogun/io/openml/OpenMLData.cpp | 4 +- src/shogun/io/openml/OpenMLRun.cpp | 23 +++++---- src/shogun/io/openml/OpenMLRun.h | 15 +++--- src/shogun/io/openml/ShogunOpenML.cpp | 72 +++++++++++++++++++++++---- 4 files changed, 80 insertions(+), 34 deletions(-) diff --git a/src/shogun/io/openml/OpenMLData.cpp b/src/shogun/io/openml/OpenMLData.cpp index 5f88fcbfb24..4906c6db345 100644 --- a/src/shogun/io/openml/OpenMLData.cpp +++ b/src/shogun/io/openml/OpenMLData.cpp @@ -186,10 +186,8 @@ std::shared_ptr OpenMLData::get_features(const std::string& label) copy_feat, result->get_feature_matrix().data(), m_feature_types.size() * m_cached_features.size()); - result = std::make_shared>( + return std::make_shared>( copy_feat, m_feature_types.size(), n_examples); - - return result; } std::shared_ptr OpenMLData::get_labels() diff --git a/src/shogun/io/openml/OpenMLRun.cpp b/src/shogun/io/openml/OpenMLRun.cpp index da28c07994d..8cd761d6015 100644 --- a/src/shogun/io/openml/OpenMLRun.cpp +++ b/src/shogun/io/openml/OpenMLRun.cpp @@ -4,7 +4,6 @@ * Authors: Gil Hoben */ -#include #include #include #include @@ -47,7 +46,7 @@ std::shared_ptr OpenMLRun::run_flow_on_task( SG_SERROR("INTERNAL ERROR: failed to cast model to machine!\n") } - auto* xval_storage = new CrossValidationStorage(); + auto xval_storage = std::make_shared(); if (task->get_split()->contains_splits()) { @@ -64,11 +63,12 @@ std::shared_ptr OpenMLRun::run_flow_on_task( for (auto fold_idx : range(task->get_num_fold())) { SGVector train_i_idx( - train_idx[repeat_idx][fold_idx].data(), - train_idx[repeat_idx][fold_idx].size()); + train_idx[repeat_idx][fold_idx].begin(), + train_idx[repeat_idx][fold_idx].end()); SGVector test_i_idx( - train_idx[repeat_idx][fold_idx].data(), - 
train_idx[repeat_idx][fold_idx].size()); + test_idx[repeat_idx][fold_idx].begin(), + test_idx[repeat_idx][fold_idx].end()); + xval_storage->append_fold_result( ShogunOpenML::run_model_on_fold( machine, task, features, labels, train_i_idx, @@ -93,9 +93,7 @@ std::shared_ptr OpenMLRun::run_flow_on_task( std::string{}, // setup_id std::string{}, // setup_string std::string{}, // parameter_settings - std::vector{}, // evaluations - std::vector{}, // fold_evaluations - std::vector{}, // sample_evaluations + xval_storage, // xval_storage std::string{}, // data_content std::vector{}, // output_files task, // task @@ -119,7 +117,10 @@ void OpenMLRun::to_filesystem(const std::string& directory) const SG_SNOTIMPLEMENTED } -void OpenMLRun::publish() const +void OpenMLRun::publish() const {SG_SNOTIMPLEMENTED} + +std::unique_ptr OpenMLRun::to_xml() const { - SG_SNOTIMPLEMENTED + + return std::unique_ptr(); } diff --git a/src/shogun/io/openml/OpenMLRun.h b/src/shogun/io/openml/OpenMLRun.h index 3619bed62d1..86e3379be31 100644 --- a/src/shogun/io/openml/OpenMLRun.h +++ b/src/shogun/io/openml/OpenMLRun.h @@ -8,6 +8,7 @@ #define SHOGUN_OPENMLRUN_H #include +#include #include #include @@ -20,9 +21,7 @@ namespace shogun { const std::string& uploader, const std::string& uploader_name, const std::string& setup_id, const std::string& setup_string, const std::string& parameter_settings, - std::vector evaluations, - std::vector fold_evaluations, - std::vector sample_evaluations, + std::shared_ptr xval_storage, const std::string& data_content, std::vector output_files, std::shared_ptr task, std::shared_ptr flow, @@ -31,9 +30,7 @@ namespace shogun { : m_uploader(uploader), m_uploader_name(uploader_name), m_setup_id(setup_id), m_setup_string(setup_string), m_parameter_settings(parameter_settings), - m_evaluations(std::move(evaluations)), - m_fold_evaluations(std::move(fold_evaluations)), - m_sample_evaluations(std::move(sample_evaluations)), + m_xval_storage(xval_storage), 
m_data_content(data_content), m_output_files(std::move(output_files)), m_task(std::move(task)), m_flow(std::move(flow)), m_run_id(run_id), @@ -55,6 +52,8 @@ namespace shogun { void to_filesystem(const std::string& directory) const; + std::unique_ptr to_xml() const; + void publish() const; private: @@ -63,9 +62,7 @@ namespace shogun { std::string m_setup_id; std::string m_setup_string; std::string m_parameter_settings; - std::vector m_evaluations; - std::vector m_fold_evaluations; - std::vector m_sample_evaluations; + std::shared_ptr m_xval_storage; std::string m_data_content; std::vector m_output_files; std::shared_ptr m_task; diff --git a/src/shogun/io/openml/ShogunOpenML.cpp b/src/shogun/io/openml/ShogunOpenML.cpp index ed73099f1c3..87987e3036f 100644 --- a/src/shogun/io/openml/ShogunOpenML.cpp +++ b/src/shogun/io/openml/ShogunOpenML.cpp @@ -6,6 +6,8 @@ * Authors: Gil Hoben */ +#include +#include #include #include @@ -310,6 +312,30 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( { auto task_type = task->get_task_type(); + CEvaluation* evaluation_criterion = nullptr; + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + evaluation_criterion = new CAccuracyMeasure(); + break; + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + evaluation_criterion = new CMeanAbsoluteError(); + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + switch (task_type) { case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: @@ -324,8 +350,6 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( // shared auto* features_clone = features->clone()->as(); auto* 
labels_clone = labels->clone()->as(); - // auto* evaluation_criterion = - // (CEvaluation*)m_evaluation_criterion->clone(); /* evtl. update xvalidation output class */ fold->set_run_index(repeat_idx); @@ -371,8 +395,10 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( SG_REF(result_labels); /* evaluate */ - // results[i] = evaluation_criterion->evaluate(result_labels, labels); - // SG_DEBUG("result on fold %d is %f\n", i, results[i]) + auto result = + evaluation_criterion->evaluate(result_labels, labels_clone); + SG_SINFO( + "result on repeat %d fold %d is %f\n", repeat_idx, fold_idx, result) /* evtl. update xvalidation output class */ fold->set_test_indices(test_idx); @@ -381,18 +407,17 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( fold->set_test_true_result(true_labels); SG_UNREF(true_labels) fold->post_update_results(); - // fold->set_evaluation_result(results[i]); + fold->set_evaluation_result(result); /* clean up, remove subsets */ labels->remove_subset(); SG_UNREF(cloned_machine); SG_UNREF(features_clone); SG_UNREF(labels_clone); - // SG_UNREF(evaluation_criterion); SG_UNREF(result_labels); + delete evaluation_criterion; return fold; } - break; case OpenMLTask::TaskType::LEARNING_CURVE: SG_SNOTIMPLEMENTED case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: @@ -417,6 +442,30 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( { auto task_type = task->get_task_type(); + CEvaluation* evaluation_criterion = nullptr; + + switch (task_type) + { + case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: + evaluation_criterion = new CAccuracyMeasure(); + break; + case OpenMLTask::TaskType::SUPERVISED_REGRESSION: + evaluation_criterion = new CMeanAbsoluteError(); + break; + case OpenMLTask::TaskType::LEARNING_CURVE: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUPERVISED_DATASTREAM_CLASSIFICATION: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::CLUSTERING: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::MACHINE_LEARNING_CHALLENGE: + 
SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SURVIVAL_ANALYSIS: + SG_SNOTIMPLEMENTED + case OpenMLTask::TaskType::SUBGROUP_DISCOVERY: + SG_SNOTIMPLEMENTED + } + switch (task_type) { case OpenMLTask::TaskType::SUPERVISED_CLASSIFICATION: @@ -446,8 +495,9 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( SG_SDEBUG("finished evaluation\n") /* evaluate */ - // results[i] = evaluation_criterion->evaluate(result_labels, labels); - // SG_DEBUG("result on fold %d is %f\n", i, results[i]) + auto result = + evaluation_criterion->evaluate(result_labels, labels_clone); + SG_SINFO("result is %f\n", result) /* evtl. update xvalidation output class */ fold->set_test_result(result_labels); @@ -455,14 +505,14 @@ std::unique_ptr ShogunOpenML::run_model_on_fold( fold->set_test_true_result(true_labels); SG_UNREF(true_labels) fold->post_update_results(); - // fold->set_evaluation_result(results[i]); + fold->set_evaluation_result(result); // cleanup SG_UNREF(cloned_machine); SG_UNREF(features_clone); SG_UNREF(labels_clone); - // SG_UNREF(evaluation_criterion); SG_UNREF(result_labels); + delete evaluation_criterion; return fold; } case OpenMLTask::TaskType::LEARNING_CURVE: