Skip to content

Commit

Permalink
Add Base32
Browse files Browse the repository at this point in the history
  • Loading branch information
Joe-Abraham committed May 29, 2024
1 parent abaf323 commit b7e8aaf
Show file tree
Hide file tree
Showing 7 changed files with 193 additions and 16 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "third_party/cppcodec"]
path = third_party/cppcodec
url = https://github.com/tplgy/cppcodec.git
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -567,5 +567,8 @@ if("${TREAT_WARNINGS_AS_ERRORS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
endif()

# Add the path to cppcodec headers
include_directories(${CMAKE_SOURCE_DIR}/third_party/cppcodec)

add_subdirectory(third_party)
add_subdirectory(velox)
1 change: 1 addition & 0 deletions third_party/cppcodec
Submodule cppcodec added at 8019b8
41 changes: 41 additions & 0 deletions velox/docs/functions/presto/binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,28 @@ Binary Functions

Decodes ``bigint`` value from a 64-bit 2’s complement big endian ``binary``.

.. function:: from_base32(string) -> varbinary

Decodes a Base32-encoded ``string`` back into its original binary form.
This function can handle both padded and non-padded Base32 encoded strings. Partially padded Base32 strings will result in an error.

Examples
--------
Query with padded Base32 string:
::
SELECT from_base32('JBSWY3DPEBLW64TMMQ======'); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]

Query with non-padded Base32 string:
::
SELECT from_base32('JBSWY3DPEBLW64TMMQ'); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]

Query with partially padded Base32 string:
::
SELECT from_base32('JBSWY3DPEBLW64TM=='); -- Error: Base32::decode() - invalid input string: length is not a multiple of 8.

In the examples above, both fully padded and non-padded Base32 strings ('JBSWY3DPEBLW64TMMQ======' and 'JBSWY3DPEBLW64TMMQ') decode to the binary representation of the text 'Hello World'.
The partially padded Base32 string 'JBSWY3DPEBLW64TM==' will lead to a decoding error.

.. function:: from_hex(string) -> varbinary

Decodes binary data from the hex encoded ``string``.
Expand Down Expand Up @@ -115,6 +137,25 @@ Binary Functions

Encodes ``bigint`` in a 64-bit 2’s complement big endian format.

.. function:: to_base32(varbinary) -> string

Encodes ``varbinary`` into its Base32 string representation.
The result is always padded with ``=`` to a multiple of 8 characters.

Examples
--------
Query to encode a binary value to a padded Base32 string:
::
SELECT to_base32(ARRAY[72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]); -- 'JBSWY3DPEBLW64TMMQ======'

Query to encode a binary value with fewer bytes:
::
SELECT to_base32(ARRAY[104, 101, 108, 108, 111]); -- 'NBSWY3DP'

In the above examples, the binary array `[72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]` is encoded to the padded Base32 string 'JBSWY3DPEBLW64TMMQ======'.
The binary array `[104, 101, 108, 108, 111]` is encoded to 'NBSWY3DP'.


.. function:: to_hex(binary) -> varchar

Encodes ``binary`` into a hex string representation.
Expand Down
104 changes: 88 additions & 16 deletions velox/functions/prestosql/BinaryFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@

#include <folly/hash/Checksum.h>
#define XXH_INLINE_ALL
#include <cppcodec/base32_rfc4648.hpp>
#include <cppcodec/base64_rfc4648.hpp>
#include <cppcodec/base64_url.hpp>
#include <xxhash.h>

#include "folly/ssl/OpenSSLHash.h"
#include "velox/common/base/BitUtil.h"
#include "velox/common/encode/Base64.h"
#include "velox/external/md5/md5.h"
#include "velox/functions/Udf.h"
#include "velox/functions/lib/ToHex.h"
Expand Down Expand Up @@ -276,24 +278,36 @@ struct ToBase64Function {
FOLLY_ALWAYS_INLINE void call(
out_type<Varchar>& result,
const arg_type<Varbinary>& input) {
result.resize(encoding::Base64::calculateEncodedSize(input.size()));
encoding::Base64::encode(input.data(), input.size(), result.data());
auto encoded = cppcodec::base64_rfc4648::encode<std::string>(
reinterpret_cast<const uint8_t*>(input.data()), input.size());
result.resize(encoded.size());
std::copy(encoded.begin(), encoded.end(), result.data());
}
};

template <typename T>
struct FromBase64Function {
VELOX_DEFINE_FUNCTION_TYPES(T);

FOLLY_ALWAYS_INLINE void call(
out_type<Varbinary>& result,
const arg_type<Varchar>& input) {
try {
auto inputSize = input.size();
result.resize(
encoding::Base64::calculateDecodedSize(input.data(), inputSize));
encoding::Base64::decode(
input.data(), inputSize, result.data(), result.size());
} catch (const encoding::Base64Exception& e) {
std::string inputStr = std::string(input.data(), input.size());

if (inputStr.find('=') != std::string::npos) {
// Calculate the number of padding characters needed
size_t padding = (4 - (inputStr.size() % 4)) % 4;
inputStr.append(padding, '=');
}

// Decode using cppcodec with padding
std::vector<uint8_t> decoded =
cppcodec::base64_rfc4648::decode<std::vector<uint8_t>>(inputStr);

result.resize(decoded.size());
std::copy(decoded.begin(), decoded.end(), result.data());
} catch (const cppcodec::parse_error& e) {
VELOX_USER_FAIL(e.what());
}
}
Expand All @@ -302,14 +316,28 @@ struct FromBase64Function {
/// Presto `from_base64url(varchar) -> varbinary`.
///
/// Decodes URL-safe Base64 text with cppcodec::base64_url. The input may be
/// fully padded or carry no padding at all. Malformed input (bad symbols, bad
/// length, partial padding) is reported as a Velox user error.
template <typename T>
struct FromBase64UrlFunction {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE void call(
      out_type<Varbinary>& result,
      const arg_type<Varchar>& input) {
    try {
      std::string inputStr(input.data(), input.size());

      // cppcodec::base64_url expects padded input. Complete the final
      // 4-character block only when the caller supplied no padding at all;
      // input that already contains '=' is passed through untouched so that
      // incorrect padding is rejected by the decoder instead of being masked.
      if (inputStr.find('=') == std::string::npos) {
        const size_t padding = (4 - (inputStr.size() % 4)) % 4;
        inputStr.append(padding, '=');
      }

      const std::vector<uint8_t> decoded =
          cppcodec::base64_url::decode<std::vector<uint8_t>>(inputStr);

      result.resize(decoded.size());
      std::copy(decoded.begin(), decoded.end(), result.data());
    } catch (const cppcodec::parse_error& e) {
      // Surface decoder failures as user-facing errors.
      VELOX_USER_FAIL(e.what());
    }
  }
};

Expand All @@ -320,8 +348,52 @@ struct ToBase64UrlFunction {
FOLLY_ALWAYS_INLINE void call(
out_type<Varchar>& result,
const arg_type<Varbinary>& input) {
result.resize(encoding::Base64::calculateEncodedSize(input.size()));
encoding::Base64::encodeUrl(input.data(), input.size(), result.data());
auto encoded = cppcodec::base64_url::encode<std::string>(
reinterpret_cast<const uint8_t*>(input.data()), input.size());
result.resize(encoded.size());
std::copy(encoded.begin(), encoded.end(), result.data());
}
};

/// Presto `to_base32(varbinary) -> varchar`.
///
/// Produces the cppcodec::base32_rfc4648 encoding of the input bytes.
template <typename T>
struct ToBase32Function {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE void call(
      out_type<Varchar>& result,
      const arg_type<Varbinary>& input) {
    // cppcodec consumes unsigned bytes; the input buffer exposes chars.
    const auto* bytes = reinterpret_cast<const uint8_t*>(input.data());
    const std::string text =
        cppcodec::base32_rfc4648::encode<std::string>(bytes, input.size());

    // Size the output first, then copy the encoded characters into it.
    result.resize(text.size());
    std::copy(text.cbegin(), text.cend(), result.data());
  }
};

/// Presto `from_base32(varchar) -> varbinary`.
///
/// Decodes RFC 4648 Base32 text with cppcodec::base32_rfc4648. The input may
/// be fully padded or carry no padding at all; partially padded input is an
/// error. Any decoder failure is reported as a Velox user error.
template <typename T>
struct FromBase32Function {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE void call(
      out_type<Varbinary>& result,
      const arg_type<Varchar>& input) {
    try {
      std::string inputStr(input.data(), input.size());

      // cppcodec::base32_rfc4648 expects padded input. Complete the final
      // 8-character block only when the caller supplied no padding at all;
      // input that already contains '=' is passed through untouched so that
      // partial padding is rejected by the decoder instead of being masked.
      if (inputStr.find('=') == std::string::npos) {
        const size_t padding = (8 - (inputStr.size() % 8)) % 8;
        inputStr.append(padding, '=');
      }

      const std::vector<uint8_t> decoded =
          cppcodec::base32_rfc4648::decode<std::vector<uint8_t>>(inputStr);

      result.resize(decoded.size());
      std::copy(decoded.begin(), decoded.end(), result.data());
    } catch (const cppcodec::parse_error& e) {
      // Surface decoder failures as user-facing errors.
      VELOX_USER_FAIL(e.what());
    }
  }
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ void registerSimpleFunctions(const std::string& prefix) {
{prefix + "to_base64url"});
registerFunction<FromBase64UrlFunction, Varbinary, Varchar>(
{prefix + "from_base64url"});
registerFunction<ToBase32Function, Varchar, Varbinary>(
{prefix + "to_base32"});
registerFunction<FromBase32Function, Varbinary, Varchar>(
{prefix + "from_base32"});

registerFunction<FromBigEndian32, int32_t, Varbinary>(
{prefix + "from_big_endian_32"});
Expand Down
53 changes: 53 additions & 0 deletions velox/functions/prestosql/tests/BinaryFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,59 @@ TEST_F(BinaryFunctionsTest, fromBase64Url) {
EXPECT_THROW(fromBase64Url("YQ=/"), VeloxUserError);
}

TEST_F(BinaryFunctionsTest, toBase32) {
  // Evaluates to_base32 on a single (possibly null) input value.
  const auto encode = [&](std::optional<std::string> raw) {
    return evaluateOnce<std::string>("to_base32(cast(c0 as varbinary))", raw);
  };

  // A null input yields a null result.
  EXPECT_EQ(std::nullopt, encode(std::nullopt));

  // Plain text paired with its expected padded Base32 encoding.
  struct Case {
    const char* plain;
    const char* encoded;
  };
  const Case cases[] = {
      {"", ""},
      {"a", "ME======"},
      {"abc", "MFRGG==="},
      {"no", "NZXQ===="},
      {"we", "O5SQ===="},
      {"db2", "MRRDE==="},
      {"cake", "MNQWWZI="},
      {"keen", "NNSWK3Q="},
      {"1234", "GEZDGNA="},
      {"hello world", "NBSWY3DPEB3W64TMMQ======"},
      {"Hello World from Velox!", "JBSWY3DPEBLW64TMMQQGM4TPNUQFMZLMN54CC==="},
  };
  for (const auto& c : cases) {
    EXPECT_EQ(c.encoded, encode(c.plain));
  }
}

TEST_F(BinaryFunctionsTest, fromBase32) {
  // Evaluates from_base32 on a single (possibly null) input value.
  const auto decode = [&](std::optional<std::string> text) {
    return evaluateOnce<std::string>("from_base32(c0)", text);
  };

  // Base32 text paired with the bytes it is expected to decode to.
  struct Case {
    const char* encoded;
    const char* decoded;
  };

  // A null input yields a null result.
  EXPECT_EQ(std::nullopt, decode(std::nullopt));

  // Empty and fully padded inputs.
  const Case padded[] = {
      {"", ""},
      {"ME======", "a"},
      {"MFRA====", "ab"},
      {"MFRGG===", "abc"},
      {"MRRDE===", "db2"},
      {"MFRGGZA=", "abcd"},
      {"NBSWY3DPEB3W64TMMQ======", "hello world"},
      {"JBSWY3DPEBLW64TMMQQGM4TPNUQFMZLMN54CC===", "Hello World from Velox!"},
  };
  for (const auto& c : padded) {
    EXPECT_EQ(c.decoded, decode(c.encoded));
  }

  // The same kind of inputs with the padding omitted entirely.
  const Case unpadded[] = {
      {"ME", "a"},
      {"MFRA", "ab"},
      {"MFRGG", "abc"},
      {"MRRDE", "db2"},
      {"MFRGGZA", "abcd"},
      {"GEZDGNA", "1234"},
      {"MFRGGZDF", "abcde"},
      {"MFRGGZDFMY", "abcdef"},
  };
  for (const auto& c : unpadded) {
    EXPECT_EQ(c.decoded, decode(c.encoded));
  }

  // Invalid encoded strings must surface as user errors.
  EXPECT_THROW(decode("1="), VeloxUserError);
  EXPECT_THROW(decode("M1======"), VeloxUserError);
}

TEST_F(BinaryFunctionsTest, fromBigEndian32) {
const auto fromBigEndian32 = [&](const std::optional<std::string>& arg) {
return evaluateOnce<int32_t>("from_big_endian_32(c0)", VARBINARY(), arg);
Expand Down

0 comments on commit b7e8aaf

Please sign in to comment.