Skip to content

Commit

Permalink
Merge pull request #1003 from trcrsired/master
Browse files Browse the repository at this point in the history
revert all the bs changes by macromodel
  • Loading branch information
trcrsired authored Dec 3, 2024
2 parents 291ef59 + 0cf4e83 commit 6a15e4a
Show file tree
Hide file tree
Showing 10 changed files with 63 additions and 240 deletions.
37 changes: 7 additions & 30 deletions benchmark/0001.refterm/Makefile
Original file line number Diff line number Diff line change
@@ -1,32 +1,9 @@
CXX = clang++
FASTIOINCLUDEPATH = -I../../include
CXXFLAGS = -Ofast -march=native -std=c++20 $(FASTIOINCLUDEPATH)
CXXEXTRAFLAGS = -flto
LDFLAGS = -fuse-ld=lld -s
CXX = clang++ FASTIOINCLUDEPATH = -I../../ include CXXFLAGS = -Ofast - march = native - std = c++ 20 $(FASTIOINCLUDEPATH)
CXXEXTRAFLAGS = -flto
LDFLAGS = -fuse - ld = lld - s

all: out_buf.exe fpipe.exe stdio.exe stdio_hack.exe iostream.exe fstream.exe filebuf_file.exe
all : out_buf.exe fpipe.exe stdio.exe stdio_hack.exe iostream.exe fstream.exe filebuf_file.exe

out_buf.exe:pch.hpp.gch out_buf.cc
$(CXX) -o out_buf.exe out_buf.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) -include pch.hpp
fpipe.exe:pch.hpp.gch fpipe.cc
$(CXX) -o fpipe.exe fpipe.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) -include pch.hpp
stdio.exe:pch.hpp.gch stdio.cc
$(CXX) -o stdio.exe stdio.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) -include pch.hpp
stdio_hack.exe:pch.hpp.gch stdio_hack.cc
$(CXX) -o stdio_hack.exe stdio_hack.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) -include pch.hpp
iostream.exe:iostreampch.hpp.gch iostream.cc
$(CXX) -o iostream.exe iostream.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) -include iostreampch.hpp
fstream.exe:filebufpch.hpp.gch fstream.cc
$(CXX) -o fstream.exe fstream.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) -include filebufpch.hpp -lntdll
filebuf_file.exe:filebufpch.hpp.gch filebuf_file.cc
$(CXX) -o filebuf_file.exe filebuf_file.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) -include filebufpch.hpp -lntdll
filebufpch.hpp.gch:filebufpch.hpp pch.hpp.gch
$(CXX) -c filebufpch.hpp $(CXXFLAGS) $(FASTIOINCLUDEPATH) -include pch.hpp
iostreampch.hpp.gch:iostreampch.hpp pch.hpp.gch
$(CXX) -c iostreampch.hpp $(CXXFLAGS) $(FASTIOINCLUDEPATH) -include pch.hpp
pch.hpp.gch:pch.hpp
$(CXX) -c pch.hpp $(CXXFLAGS) $(FASTIOINCLUDEPATH)
clean:
rm *.gch *.tmp *.txt
distclean:
rm filebuf_file.exe fpipe.exe fstream.exe iostream.exe out_buf.exe stdio.exe stdio_hack.exe *.gch *.tmp *.txt
out_buf.exe : pch.hpp.gch out_buf.cc
$(CXX) -
o out_buf.exe out_buf.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) - include pch.hpp fpipe.exe : pch.hpp.gch fpipe.cc $(CXX) - o fpipe.exe fpipe.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) - include pch.hpp stdio.exe : pch.hpp.gch stdio.cc $(CXX) - o stdio.exe stdio.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) - include pch.hpp stdio_hack.exe : pch.hpp.gch stdio_hack.cc $(CXX) - o stdio_hack.exe stdio_hack.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) - include pch.hpp iostream.exe : iostreampch.hpp.gch iostream.cc $(CXX) - o iostream.exe iostream.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) - include iostreampch.hpp fstream.exe : filebufpch.hpp.gch fstream.cc $(CXX) - o fstream.exe fstream.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) - include filebufpch.hpp - lntdll filebuf_file.exe : filebufpch.hpp.gch filebuf_file.cc $(CXX) - o filebuf_file.exe filebuf_file.cc $(CXXFLAGS) $(LDFLAGS) $(CXXEXTRAFLAGS) - include filebufpch.hpp - lntdll filebufpch.hpp.gch : filebufpch.hpp pch.hpp.gch $(CXX) - c filebufpch.hpp $(CXXFLAGS) $(FASTIOINCLUDEPATH) - include pch.hpp iostreampch.hpp.gch : iostreampch.hpp pch.hpp.gch $(CXX) - c iostreampch.hpp $(CXXFLAGS) $(FASTIOINCLUDEPATH) - include pch.hpp pch.hpp.gch : pch.hpp $(CXX) - c pch.hpp $(CXXFLAGS) $(FASTIOINCLUDEPATH) clean : rm *.gch *.tmp *.txt distclean : rm filebuf_file.exe fpipe.exe fstream.exe iostream.exe out_buf.exe stdio.exe stdio_hack.exe *.gch *.tmp *.txt
3 changes: 2 additions & 1 deletion benchmark/0001.refterm/filebuf_file.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ inline void test()
int main()
{
fast_io::u8obuf_file timer_obf(u"filebuf_file.txt");
fast_io::timer t(u8"filebuf_file");
auto t0{posix_clock_gettime(fast_io::posix_clock_id::monotonic)};
test();
print(timer_obf, posix_clock_gettime(fast_io::posix_clock_id::monotonic) - t0, u8"s\n");
}
4 changes: 2 additions & 2 deletions benchmark/0001.refterm/fpipe.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#include <fast_io.h>
#include <fast_io_device.h>
#include <fast_io_driver/timer.h>
#include <fast_io_driver/refterm.h>
using namespace fast_io::io;

Expand All @@ -16,6 +15,7 @@ inline void test()
int main()
{
fast_io::u8obuf_file timer_obf(u"fpipe.txt");
fast_io::timer t(u8"fpipe");
auto t0{posix_clock_gettime(fast_io::posix_clock_id::monotonic)};
test();
print(timer_obf, posix_clock_gettime(fast_io::posix_clock_id::monotonic) - t0, u8"s\n");
}
3 changes: 2 additions & 1 deletion benchmark/0001.refterm/fstream.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ inline void test()
int main()
{
fast_io::u8obuf_file timer_obf(u"fstream.txt");
fast_io::timer t(u8"fstream");
auto t0{posix_clock_gettime(fast_io::posix_clock_id::monotonic)};
test();
print(timer_obf, posix_clock_gettime(fast_io::posix_clock_id::monotonic) - t0, u8"s\n");
}
3 changes: 2 additions & 1 deletion benchmark/0001.refterm/iostream.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ int main()
{
std::ios::sync_with_stdio(false);
fast_io::u8obuf_file timer_obf(u"iostream.txt");
fast_io::timer t(u8"iostream");
auto t0{posix_clock_gettime(fast_io::posix_clock_id::monotonic)};
test();
print(timer_obf, posix_clock_gettime(fast_io::posix_clock_id::monotonic) - t0, u8"s\n");
}
3 changes: 2 additions & 1 deletion benchmark/0001.refterm/out_buf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ inline void test()
int main()
{
fast_io::u8obuf_file timer_obf(u"out_buf.txt");
fast_io::timer t(u8"out_buf");
auto t0{posix_clock_gettime(fast_io::posix_clock_id::monotonic)};
test();
print(timer_obf, posix_clock_gettime(fast_io::posix_clock_id::monotonic) - t0, u8"s\n");
}
4 changes: 2 additions & 2 deletions benchmark/0001.refterm/stdio.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#include <fast_io.h>
#include <fast_io_device.h>
#include <fast_io_driver/timer.h>
using namespace fast_io::io;

inline void test()
Expand All @@ -14,6 +13,7 @@ inline void test()
int main()
{
fast_io::u8obuf_file timer_obf(u"stdio.txt");
fast_io::timer t(u8"stdio");
auto t0{posix_clock_gettime(fast_io::posix_clock_id::monotonic)};
test();
print(timer_obf, posix_clock_gettime(fast_io::posix_clock_id::monotonic) - t0, u8"s\n");
}
3 changes: 2 additions & 1 deletion benchmark/0001.refterm/stdio_hack.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ inline void test()
int main()
{
fast_io::u8obuf_file timer_obf(u"stdio_hack.txt");
fast_io::timer t(u8"stdio_hack");
auto t0{posix_clock_gettime(fast_io::posix_clock_id::monotonic)};
test();
print(timer_obf, posix_clock_gettime(fast_io::posix_clock_id::monotonic) - t0, u8"s\n");
}
185 changes: 13 additions & 172 deletions include/fast_io_core_impl/integers/sto/sto_contiguous.h
Original file line number Diff line number Diff line change
Expand Up @@ -578,10 +578,8 @@ inline constexpr ::fast_io::freestanding::array<T, n> generate_pow_table() noexc
template <char8_t base, my_unsigned_integral T, ::std::size_t n>
inline constexpr ::fast_io::freestanding::array<T, n> pow_table_n{::fast_io::details::generate_pow_table<base, T, n>()};


template <char8_t base, ::std::integral char_type, my_unsigned_integral T>
#if defined(__SSE4_1__) && __has_cpp_attribute(__gnu__::__cold__) && (defined(__x86_64__) || defined(_M_AMD64))
[[__gnu__::__cold__]]
#endif
inline constexpr parse_result<char_type const *>
scan_int_contiguous_none_simd_space_part_define_impl(char_type const *first, char_type const *last, T &res) noexcept
{
Expand Down Expand Up @@ -610,11 +608,6 @@ scan_int_contiguous_none_simd_space_part_define_impl(char_type const *first, cha
constexpr bool isebcdic{::fast_io::details::is_ebcdic<char_type>};
if constexpr (!isebcdic && (::std::numeric_limits<::std::uint_least64_t>::digits == 64u) && false)
{
// Inspired by:
// https://github.com/fastfloat/fast_float
// Implementation of higher performance (Binary to Hexadecimal):
// Optimize both fixed range and infinite range (suitable for scan)

if constexpr (sizeof(::std::uint_least32_t) < sizeof(::std::size_t))
{
if constexpr (base_char_type <= 10)
Expand Down Expand Up @@ -699,7 +692,7 @@ scan_int_contiguous_none_simd_space_part_define_impl(char_type const *first, cha

if (::std::uint_least32_t const cval{((val + first_bound) | (val - 0x30303030)) & 0x80808080}; cval)
{
auto const ctrz_cval{::std::countr_zero(cval)};
auto ctrz_cval{::std::countr_zero(cval)};
auto const valid_bits{ctrz_cval & -8};

if (!valid_bits) [[unlikely]]
Expand All @@ -723,10 +716,10 @@ scan_int_contiguous_none_simd_space_part_define_impl(char_type const *first, cha
val = (val * base_char_type) + (val >> 8);
val = (((val & mask) * pow_base_sizeof_base_2) + ((val >> 16) & mask));

constexpr auto pow_table{generate_pow_table<base_char_type, ::std::uint_least32_t, 4>()};
res = static_cast<T>(res * pow_table[ctrz_cval / (8 * sizeof(char_type))] + val);
ctrz_cval >= shifter;
res = static_cast<T>(res * ::fast_io::details::pow_table_n<base_char_type, ::std::uint_least32_t, 4>.index_unchecked(ctrz_cval) + val);

first += ctrz_cval / (8 * sizeof(char_type));
first += ctrz_cval;


return scan_int_contiguous_none_simd_space_part_check_overflow_impl<base, char_type, T>(first, last, res);
Expand Down Expand Up @@ -764,7 +757,7 @@ scan_int_contiguous_none_simd_space_part_define_impl(char_type const *first, cha

if (::std::uint_least64_t const cval{((val + first_bound) | (val - 0x0030003000300030)) & 0x8000800080008000}; cval)
{
auto const ctrz_cval{::std::countr_zero(cval)};
auto ctrz_cval{::std::countr_zero(cval)};
auto const valid_bits{ctrz_cval & -16};

if (!valid_bits) [[unlikely]]
Expand All @@ -787,11 +780,11 @@ scan_int_contiguous_none_simd_space_part_define_impl(char_type const *first, cha
val -= 0x0030003000300030;
val = (val * base_char_type) + (val >> 16);
val = (((val & mask) * pow_base_sizeof_base_2) + ((val >> 32) & mask));
constexpr auto pow_table{generate_pow_table<base_char_type, ::std::uint_least64_t, 4>()};

res = static_cast<T>(res * pow_table[ctrz_cval / (8 * sizeof(char_type))] + val);
ctrz_cval >>= shifter;
res = static_cast<T>(res * ::fast_io::details::pow_table_n<base_char_type, ::std::uint_least64_t, 4>.index_unchecked(ctrz_cval) + val);

first += ctrz_cval / (8 * sizeof(char_type));
first += ctrz_cval;

return scan_int_contiguous_none_simd_space_part_check_overflow_impl<base, char_type, T>(first, last, res);
}
Expand Down Expand Up @@ -833,7 +826,7 @@ scan_int_contiguous_none_simd_space_part_define_impl(char_type const *first, cha
0x8080808080808080};
cval)
{
auto const ctrz_cval{::std::countr_zero(cval)};
auto ctrz_cval{::std::countr_zero(cval)};
auto const valid_bits{ctrz_cval & -8};

if (!valid_bits) [[unlikely]]
Expand Down Expand Up @@ -861,11 +854,10 @@ scan_int_contiguous_none_simd_space_part_define_impl(char_type const *first, cha
val = (val * base_char_type) + (val >> 8);
val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;

constexpr auto pow_table{generate_pow_table<base_char_type, ::std::uint_least64_t, 8>()};

res = static_cast<T>(res * pow_table[ctrz_cval / (8 * sizeof(char_type))] + val);
ctrz_cval >>= shifter;

first += ctrz_cval / (8 * sizeof(char_type));
res = static_cast<T>(res * ::fast_io::details::pow_table_n<base_char_type, ::std::uint_least64_t, 8>.index_unchecked(ctrz_cval) + val);
first += ctrz_cval;

return scan_int_contiguous_none_simd_space_part_check_overflow_impl<base, char_type, T>(first, last, res);
}
Expand All @@ -886,80 +878,6 @@ scan_int_contiguous_none_simd_space_part_define_impl(char_type const *first, cha
first += sizeof(::std::uint_least64_t);
}
}

// It seems that it is unnecessary to read 4-bit data in base11-hexadecimal system,
// and the time for 4-bit data reading of this algorithm even exceeds 4 times of cyclic reading
#if 0
if constexpr (max_size >= sizeof(::std::uint_least32_t))
{
if (static_cast<::std::size_t>(first_phase_last - first) >= sizeof(::std::uint_least32_t))
{
::std::uint_least32_t val;
::fast_io::freestanding::my_memcpy(__builtin_addressof(val), first, sizeof(::std::uint_least32_t));

val = ::fast_io::little_endian(val);

constexpr ::std::uint_least32_t pow_base_sizeof_u32{::fast_io::details::compile_time_pow<::std::uint_least32_t>(static_cast<::std::uint_least32_t>(base_char_type), sizeof(::std::uint_least32_t))};
constexpr ::std::uint_least32_t first_bound1{0x39393939 + 0x01010101 * (16 - base_char_type)};
constexpr ::std::uint_least32_t first_bound2{0x19191919 + 0x01010101 * (16 - base_char_type)};

if (::std::uint_least32_t const cval{((((val + 0x46464646) | (val - 0x30303030)) &
((val + first_bound1) | (val - 0x40404040)) &
((val + first_bound2) | (val - 0x60606060))) |
~(((val + 0x3f3f3f3f) | (val - 0x40404040)) &
((val + 0x1f1f1f1f) | (val - 0x60606060)))) &
0x80808080};
cval)
{
auto const ctrz_cval{::std::countr_zero(cval)};
auto const valid_bits{ctrz_cval & -8};

if (!valid_bits) [[unlikely]]
{
return scan_int_contiguous_none_simd_space_part_check_overflow_impl<base, char_type, T>(first, last, res);
}

val <<= 32 - valid_bits;

::std::uint_least32_t all_zero{0x30303030};

all_zero >>= valid_bits;

val |= all_zero;

constexpr ::std::uint_least32_t pow_base_sizeof_base_2{::fast_io::details::compile_time_pow<::std::uint_least32_t>(static_cast<::std::uint_least32_t>(base_char_type), 2)};

constexpr ::std::uint_least32_t mask{0x000000FF};

val -= 0x30303030;
val = (val & 0x0f0f0f0f) + ((val & 0x10101010) >> 4) * 9;
val = (val * base_char_type) + (val >> 8);
val = (((val & mask) * pow_base_sizeof_base_2) + ((val >> 16) & mask));

constexpr auto pow_table{generate_pow_table<base_char_type, ::std::uint_least32_t, 4>()};

res = static_cast<T>(res * pow_table[ctrz_cval / (8 * sizeof(char_type))] + val);

first += ctrz_cval / (8 * sizeof(char_type));

return scan_int_contiguous_none_simd_space_part_check_overflow_impl<base, char_type, T>(first, last, res);
}
else
{
constexpr ::std::uint_least32_t pow_base_sizeof_base_2{::fast_io::details::compile_time_pow<::std::uint_least32_t>(static_cast<::std::uint_least32_t>(base_char_type), 2)};

constexpr ::std::uint_least32_t mask{0x000000FF};

val -= 0x30303030;
val = (val & 0x0f0f0f0f) + ((val & 0x10101010) >> 4) * 9;
val = (val * base_char_type) + (val >> 8);
val = (((val & mask) * pow_base_sizeof_base_2) + ((val >> 16) & mask));
res = static_cast<T>(res * pow_base_sizeof_u32 + val);
first += sizeof(::std::uint_least32_t);
}
}
}
#endif
}
}
}
Expand Down Expand Up @@ -1028,83 +946,6 @@ scan_int_contiguous_none_simd_space_part_define_impl(char_type const *first, cha
}
}
}
else if constexpr (base_char_type <= 16)
{
if constexpr (sizeof(char_type) == sizeof(char8_t))
{
// It seems that it is unnecessary to read 4-bit data in base11-hexadecimal system,
// and the time for 4-bit data reading of this algorithm even exceeds 4 times of cyclic reading
#if 0
if constexpr (max_size >= sizeof(::std::uint_least32_t))
{
while (static_cast<::std::size_t>(first_phase_last - first) >= sizeof(::std::uint_least32_t))
{
::std::uint_least32_t val;
::fast_io::freestanding::my_memcpy(__builtin_addressof(val), first, sizeof(::std::uint_least32_t));

val = ::fast_io::little_endian(val);

constexpr ::std::uint_least32_t pow_base_sizeof_u32{::fast_io::details::compile_time_pow<::std::uint_least32_t>(static_cast<::std::uint_least32_t>(base_char_type), sizeof(::std::uint_least32_t))};
constexpr ::std::uint_least32_t first_bound1{0x39393939 + 0x01010101 * (16 - base_char_type)};
constexpr ::std::uint_least32_t first_bound2{0x19191919 + 0x01010101 * (16 - base_char_type)};

if (::std::uint_least32_t const cval{((((val + 0x46464646) | (val - 0x30303030)) &
((val + first_bound1) | (val - 0x40404040)) &
((val + first_bound2) | (val - 0x60606060))) |
~(((val + 0x3f3f3f3f) | (val - 0x40404040)) &
((val + 0x1f1f1f1f) | (val - 0x60606060)))) &
0x80808080};
cval)
{
auto const ctrz_cval{::std::countr_zero(cval)};
auto const valid_bits{ctrz_cval & -8};

if (!valid_bits) [[unlikely]]
{
return scan_int_contiguous_none_simd_space_part_check_overflow_impl<base, char_type, T>(first, last, res);
}

val <<= 32 - valid_bits;

::std::uint_least32_t all_zero{0x30303030};

all_zero >>= valid_bits;

val |= all_zero;

constexpr ::std::uint_least32_t pow_base_sizeof_base_2{::fast_io::details::compile_time_pow<::std::uint_least32_t>(static_cast<::std::uint_least32_t>(base_char_type), 2)};

constexpr ::std::uint_least32_t mask{0x000000FF};

val -= 0x30303030;
val = (val & 0x0f0f0f0f) + ((val & 0x10101010) >> 4) * 9;
val = (val * base_char_type) + (val >> 8);
val = (((val & mask) * pow_base_sizeof_base_2) + ((val >> 16) & mask));

constexpr auto pow_table{generate_pow_table<base_char_type, ::std::uint_least32_t, 4>()};
res = static_cast<T>(res * pow_table[ctrz_cval / (8 * sizeof(char_type))] + val);

first += ctrz_cval / (8 * sizeof(char_type));

return scan_int_contiguous_none_simd_space_part_check_overflow_impl<base, char_type, T>(first, last, res);
}

constexpr ::std::uint_least32_t pow_base_sizeof_base_2{::fast_io::details::compile_time_pow<::std::uint_least32_t>(static_cast<::std::uint_least32_t>(base_char_type), 2)};

constexpr ::std::uint_least32_t mask{0x000000FF};
val -= 0x30303030;

val = (val & 0x0f0f0f0f) + ((val & 0x10101010) >> 4) * 9;

val = (val * base_char_type) + (val >> 8);
val = (((val & mask) * pow_base_sizeof_base_2) + ((val >> 16) & mask));
res = static_cast<T>(res * pow_base_sizeof_u32 + val);
first += sizeof(::std::uint_least32_t);
}
}
#endif
}
}
}
}
}
Expand Down
Loading

0 comments on commit 6a15e4a

Please sign in to comment.