Merge branch 'frontend'

Snektron · Jul 10, 2021 · 365e0ae · 365e0ae
2 parents bf0ed45 + a319492
commit 365e0ae
Show file tree

Hide file tree

Showing 23 changed files with 913 additions and 169 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,3 @@
-lib
 build
 subprojects/*
 !subprojects/*.wrap
diff --git a/include/pareas/compiler/frontend.hpp b/include/pareas/compiler/frontend.hpp
@@ -4,7 +4,7 @@
 #include "futhark_generated.h"
 
 #include "pareas/compiler/ast.hpp"
-#include "pareas/compiler/profiler.hpp"
+#include "pareas/profiler/profiler.hpp"
 
 #include <chrono>
 #include <stdexcept>
@@ -33,7 +33,7 @@ namespace frontend {
             std::runtime_error(error_name(e)) {}
     };
 
-    DeviceAst compile(futhark_context* ctx, const std::string& input, Profiler& p);
+    DeviceAst compile(futhark_context* ctx, const std::string& input, pareas::Profiler& p);
 }
 
 #endif
diff --git a/include/pareas/compiler/profiler.hpp b/include/pareas/compiler/profiler.hpp
diff --git a/include/pareas/profiler/profiler.hpp b/include/pareas/profiler/profiler.hpp
@@ -0,0 +1,48 @@
+#ifndef _PAREAS_PROFILER_PROFILER_HPP
+#define _PAREAS_PROFILER_PROFILER_HPP
+
+#include <iosfwd>
+#include <chrono>
+#include <vector>
+#include <functional>
+
+namespace pareas {
+    // Timing helper built around begin()/end() pairs plus a measure()
+    // convenience wrapper. Only measure() and null_callback() are defined
+    // in this header; every other member function is declared here and
+    // defined elsewhere.
+    struct Profiler {
+        // Type of the callable stored in sync_callback.
+        using SyncCallback = std::function<void()>;
+
+        // Clock providing the time_point and duration types used below.
+        using Clock = std::chrono::high_resolution_clock;
+
+        // One record consisting of a level, a name and an elapsed duration.
+        // NOTE(review): presumably one record per completed begin()/end()
+        // pair - confirm against the implementation.
+        struct HistoryEntry {
+            unsigned level;
+            // Stored as a raw pointer, not copied by this struct.
+            const char* name;
+            Clock::duration elapsed;
+        };
+
+        // NOTE(review): the exact meaning of max_level and level is not
+        // visible in this header - confirm against the implementation.
+        unsigned max_level;
+        unsigned level;
+
+        SyncCallback sync_callback;
+        // NOTE(review): presumably the start times of open begin() calls
+        // and the finished measurements, respectively - confirm.
+        std::vector<Clock::time_point> starts;
+        std::vector<HistoryEntry> history;
+
+        // Single-argument constructor; it is not marked explicit.
+        Profiler(unsigned max_level);
+
+        // When called without an argument, null_callback is passed.
+        void set_sync_callback(SyncCallback sync_callback = null_callback);
+
+        void begin();
+        void end(const char* name);
+
+        void dump(std::ostream& os);
+
+        // Calls begin(), invokes f(), then calls end(name), in that order.
+        // There is no exception handling here: if f() throws, end(name)
+        // is not reached.
+        template <typename F>
+        void measure(const char* name, F f) {
+            this->begin();
+            f();
+            this->end(name);
+        }
+
+        // Callback that does nothing; the default argument of
+        // set_sync_callback.
+        static void null_callback() {}
+    };
+}
+
+#endif
diff --git a/lib/github.com/diku-dk/segmented/segmented.fut b/lib/github.com/diku-dk/segmented/segmented.fut
@@ -0,0 +1,103 @@
+-- | Irregular segmented operations, like scans and reductions.
+
+-- | Segmented inclusive scan.  ``flags`` marks the first element of
+-- each segment with `true` (`false` continues the current segment);
+-- within every segment the elements of ``as`` are combined from left
+-- to right with the associative operator ``op``, whose neutral
+-- element is ``ne``.
+let segmented_scan [n] 't (op: t -> t -> t) (ne: t)
+                          (flags: [n]bool) (as: [n]t): [n]t =
+  -- Lift `op` to (flag, value) pairs: a set flag on the right operand
+  -- discards whatever was accumulated to its left.
+  let lifted (f1, v1) (f2, v2) =
+    (f1 || f2, if f2 then v2 else v1 `op` v2)
+  let (_, vals) = unzip (scan lifted (false, ne) (zip flags as))
+  in vals
+
+-- | Segmented reduction.  ``flags`` marks the first element of each
+-- segment of ``as`` with `true` (`false` continues the current
+-- segment).  Every segment is reduced with the associative operator
+-- ``op`` (neutral element ``ne``), and one value per segment is
+-- returned.
+let segmented_reduce [n] 't (op: t -> t -> t) (ne: t)
+                            (flags: [n]bool) (as: [n]t) =
+  -- After an inclusive segmented scan, the last element of every
+  -- segment holds that segment's reduction.
+  let scanned = segmented_scan op ne flags as
+  -- An element ends a segment when its (cyclic) successor starts one.
+  let is_end = rotate 1 flags
+  -- Running count of segment ends seen so far, this element included.
+  let end_counts = scan (+) 0 (map i64.bool is_end)
+  let num_segments = if n > 0 then last end_counts else 0
+  -- Segment ends are written to slot count-1; every other element
+  -- gets the out-of-bounds index -1.
+  let dest = map2 (\count e -> if e then count - 1 else -1) end_counts is_end
+  -- The initial contents of the destination do not matter: all slots
+  -- are overwritten by the segment ends.
+  in scatter (replicate num_segments ne) dest scanned
+
+-- | Replicated iota.  Returns an array in which index `i` (counting
+-- from 0) occurs ``reps[i]`` times, in increasing order of `i`.  For
+-- example, replicated_iota [2,3,1] returns [0,0,1,1,1,2].
+let replicated_iota [n] (reps:[n]i64) : []i64 =
+  let total = reduce (+) 0 reps
+  let ends = scan (+) 0 reps
+  -- Shift the inclusive sums one step right (first entry 0) to get
+  -- the offset at which each index begins in the result.
+  let starts = map2 (\i prev -> if i == 0 then 0 else prev)
+                    (iota n) (rotate (-1) ends)
+  -- At every begin offset, record the largest index beginning there;
+  -- all other positions stay 0.
+  let marks = reduce_by_index (replicate total 0) i64.max 0 starts (iota n)
+  -- Positive marks open new segments, and summing within a segment
+  -- spreads the mark over the whole segment.
+  in segmented_scan (+) 0 (map (>0) marks) marks
+
+-- | Segmented iota.  Returns, for every position, its zero-based
+-- offset within its segment, where `true` in ``flags`` starts a new
+-- segment.  As an example, segmented_iota
+-- [false,false,false,true,false,false,false] returns the array
+-- [0,1,2,0,1,2,3].
+let segmented_iota [n] (flags:[n]bool) : [n]i64 =
+  -- A segmented sum of ones counts positions from 1 within each
+  -- segment; subtracting one makes the count zero-based.
+  segmented_scan (+) 0 flags (replicate n 1)
+  |> map (\count -> count - 1)
+
+-- | Generic expansion.  Every element `x` of ``arr`` is expanded into
+-- ``sz x`` target elements, the `j`'th of which (counting from 0) is
+-- ``get x j``; the expansions are laid out one after another in
+-- source order.  As an example, expand (\x->x) (*) [2,3,1] returns
+-- the array [0,2,0,3,6,0].
+let expand 'a 'b (sz: a -> i64) (get: a -> i64 -> b) (arr:[]a) : []b =
+  -- Source index of every target element.
+  let src = replicated_iota (map sz arr)
+  -- True wherever the source index differs from its cyclic predecessor.
+  let changes = map2 (!=) src (rotate (-1) src)
+  -- Position of every target element within its own expansion.
+  let offsets = segmented_iota changes
+  in map2 (\s k -> get arr[s] k) src offsets
+
+-- | Expansion combined with a segmented reduction.  The elements of
+-- ``arr`` are expanded as by `expand`, and the expanded segments are
+-- then reduced with ``op`` (neutral element ``ne``).  The flags
+-- vector computed for the expansion is reused for the reduction, so
+-- it does not have to be rebuilt as it would be when calling `expand`
+-- and a segmented reduction separately.
+let expand_reduce 'a 'b (sz: a -> i64) (get: a -> i64 -> b)
+                        (op: b -> b -> b) (ne:b) (arr:[]a) : []b =
+  -- Source index of every expanded element.
+  let src = replicated_iota (map sz arr)
+  -- True wherever the source index differs from its cyclic predecessor.
+  let flags = map2 (!=) src (rotate (-1) src)
+  -- The expanded values, using each element's offset in its segment.
+  let vals = map2 (\s k -> get arr[s] k) src (segmented_iota flags)
+  in segmented_reduce op ne flags vals
+
+-- | Expansion followed by an ''outer segmented reduce'': element `i`
+-- of the result is the reduction of the expansion of element `i` of
+-- ``arr``, so the result has as many elements as ``arr``.
+let expand_outer_reduce 'a 'b [n] (sz: a -> i64) (get: a -> i64 -> b)
+                                  (op: b -> b -> b) (ne: b)
+                                  (arr: [n]a) : [n]b =
+  let is_empty x = sz x == 0
+  -- A source of size 0 is expanded to a single element holding `ne`,
+  -- so that it still produces a segment of its own.
+  let padded_sz x = if is_empty x then 1 else sz x
+  let padded_get x i = if is_empty x then ne else get x i
+  in expand_reduce padded_sz padded_get op ne arr :> [n]b
diff --git a/lib/github.com/diku-dk/segmented/segmented_tests.fut b/lib/github.com/diku-dk/segmented/segmented_tests.fut
@@ -0,0 +1,74 @@
+-- | ignore
+
+import "segmented"
+
+-- ==
+-- entry: test_segmented_scan
+-- input { [true,false,false,true,false,false,true,false,false,false]
+--         [1i64,2i64,3i64,4i64,5i64,6i64,7i64,8i64,9i64,10i64] }
+-- output { [1i64,3i64,6i64,4i64,9i64,15i64,7i64,15i64,24i64,34i64] }
+-- input { [true] [1i64] }
+-- output { [1i64] }
+-- input { empty([0]bool) empty([0]i64) }
+-- output { empty([0]i64) }
+
+-- Entry point for the test cases above: segmented_scan with (+) and
+-- neutral element 0.
+entry test_segmented_scan (flags: []bool) (as: []i64) =
+  segmented_scan (+) 0 flags as
+
+-- ==
+-- entry: test_segmented_reduce
+-- input { [true,false,false,true,false,false,true,false,false,false]
+--         [1i64,2i64,3i64,4i64,5i64,6i64,7i64,8i64,9i64,10i64] }
+-- output { [6i64,15i64,34i64] }
+-- input { [true] [1i64] }
+-- output { [1i64] }
+
+-- Entry point for the test cases above: segmented_reduce with (+) and
+-- neutral element 0.
+entry test_segmented_reduce (flags: []bool) (as: []i64) =
+  segmented_reduce (+) 0 flags as
+
+-- ==
+-- entry: test_replicated_iota
+-- input { [2i64,3i64,1i64] } output { [0i64,0i64,1i64,1i64,1i64,2i64] }
+-- input { [3i64] } output { [0i64,0i64,0i64] }
+-- input { [2i64,0i64,1i64] } output { [0i64,0i64,2i64] }
+-- input { empty([0]i64) } output { empty([0]i64) }
+-- input { [0i64] } output { empty([0]i64) }
+-- input { [0i64,0i64] } output { empty([0]i64) }
+
+-- Entry point for the test cases above: forwards the repetition
+-- array to replicated_iota.
+entry test_replicated_iota (repl:[]i64) : []i64 =
+  replicated_iota repl
+
+-- ==
+-- entry: test_segmented_iota
+-- input { [false,false,false,true,false,false,false] }
+-- output { [0i64,1i64,2i64,0i64,1i64,2i64,3i64] }
+-- input { [false] } output { [0i64] }
+-- input { [true] } output { [0i64] }
+-- input { empty([0]bool) } output { empty([0]i64) }
+
+-- Entry point for the test cases above: forwards the flags array to
+-- segmented_iota.
+entry test_segmented_iota (flags:[]bool) : []i64 =
+  segmented_iota flags
+
+-- ==
+-- entry: test_expand
+-- input { [2i64,3i64,1i64] }
+-- output { [0i64,2i64,0i64,3i64,6i64,0i64] }
+
+-- Entry point for the test case above: each x is given size x, and
+-- the target element with number i is x*i.
+entry test_expand (arr:[]i64) : []i64 =
+  expand (\ x -> x) (\x i -> x*i) arr
+
+-- ==
+-- entry: test_expand_reduce
+-- input { [2i64,0i64,3i64,1i64] }
+-- output { [2i64,9i64,0i64] }
+
+-- Entry point for the test case above: same size and element
+-- functions as test_expand, reduced with (+) and neutral element 0.
+entry test_expand_reduce (arr:[]i64) : []i64 =
+  expand_reduce (\ x -> x) (\x i -> x*i) (+) 0 arr
+
+-- ==
+-- entry: test_expand_outer_reduce
+-- input { [2i64,0i64,3i64,1i64] }
+-- output { [2i64,0i64,9i64,0i64] }
+
+-- Entry point for the test case above: same arguments as
+-- test_expand_reduce, passed to expand_outer_reduce instead.
+entry test_expand_outer_reduce (arr:[]i64) : []i64 =
+  expand_outer_reduce (\ x -> x) (\x i -> x*i) (+) 0 arr
diff --git a/lib/github.com/diku-dk/sorts/.gitignore b/lib/github.com/diku-dk/sorts/.gitignore
@@ -0,0 +1,3 @@
+*
+!*.fut
+!.gitignore
diff --git a/lib/github.com/diku-dk/sorts/bubble_sort.fut b/lib/github.com/diku-dk/sorts/bubble_sort.fut
@@ -0,0 +1,25 @@
+-- | Parallel bubble sort.
+--
+-- This may be useful if you have almost-sorted data that you want to
+-- make fully-sorted in parallel.  Obviously *very* slow for
+-- non-sorted data.
+
+-- | Parallel bubble sort (odd-even transposition).  Runs with
+-- *O(n^2)* work and *O(n^2)* depth.  The ``<=`` argument must be a
+-- non-strict total order (in particular ``x <= x`` must hold);
+-- elements that compare equal are never exchanged, so the sort is
+-- stable.
+let bubble_sort [n] 't ((<=): t -> t -> bool) (xs: [n]t): [n]t =
+  -- New value of position i in the phase selected by b, together with
+  -- a flag telling whether it differs from the old one.  With b == 1
+  -- the pairs are (0,1), (2,3), ...; with b == -1 they are (1,2),
+  -- (3,4), ...
+  let f b xs i =
+    let dir = if i%2 == 0 then b else -b
+    let j = i + dir
+    in if j < 0 || j >= n then (false, xs[i])
+       else
+         -- Both members of a pair evaluate the same test on (lo, hi),
+         -- the pair in array order, so they always agree on whether to
+         -- exchange and no element is duplicated or lost.
+         let (lo, hi) = if dir == 1 then (i, j) else (j, i)
+         in if xs[lo] <= xs[hi] then (false, xs[i]) else (true, xs[j])
+  let iter xs b =
+    let (changed, xs) = tabulate n (f b xs) |> unzip
+    in (xs, -b, or changed)
+  -- A single phase inspects only every other adjacent pair, so the
+  -- array is known to be sorted only after two consecutive phases
+  -- without an exchange.
+  in (loop (xs, b, idle) = (xs, 1, 0i32) while idle < 2 do
+        let (xs', b', changed) = iter xs b
+        in (xs', b', if changed then 0 else idle + 1)).0
+
+-- | Like `bubble_sort`@term, but the elements are ordered by
+-- comparing the keys that ``key`` extracts from them.
+let bubble_sort_by_key [n] 't 'k (key: t -> k) ((<=): k -> k -> bool) (xs: [n]t): [n]t =
+  -- Sort (key, original index) pairs on the key alone, then use the
+  -- permuted indices to gather the elements.
+  let keyed = zip (map key xs) (iota n)
+  let sorted = bubble_sort (\(k1, _) (k2, _) -> k1 <= k2) keyed
+  in map (\(_, i) -> xs[i]) sorted
diff --git a/lib/github.com/diku-dk/sorts/bubble_sort_tests.fut b/lib/github.com/diku-dk/sorts/bubble_sort_tests.fut
@@ -0,0 +1,48 @@
+-- | ignore
+
+import "bubble_sort"
+
+-- ==
+-- entry: sort_i32
+-- input { empty([0]i32) }
+-- output { empty([0]i32) }
+-- input { [5,4,3,2,1] }
+-- output { [1,2,3,4,5] }
+-- input { [5,4,3,3,2,1] }
+-- output { [1,2,3,3,4,5] }
+
+-- Entry point for the test cases above: bubble_sort on i32 with i32.<=.
+entry sort_i32 (xs: []i32) = bubble_sort (i32.<=) xs
+
+-- ==
+-- entry: sort_u16
+-- input { [5u16,4u16,3u16,2u16,1u16] }
+-- output { [1u16,2u16,3u16,4u16,5u16] }
+
+-- Entry point for the test case above: bubble_sort on u16 with u16.<=.
+entry sort_u16 (xs: []u16) = bubble_sort (u16.<=) xs
+
+-- ==
+-- entry: sort_f32
+-- input { [5f32,4f32,3f32,2f32,1f32] }
+-- output { [1f32,2f32,3f32,4f32,5f32] }
+
+-- Entry point for the test case above: bubble_sort on f32 with f32.<=.
+entry sort_f32 (xs: []f32) = bubble_sort (f32.<=) xs
+
+-- ==
+-- entry: sort_perm_i32
+-- input { [5,4,3,2,1,0,-1,-2] }
+-- output { [7, 6, 5, 4, 3, 2, 1, 0] }
+
+-- Entry point for the test case above: pairs every element with its
+-- index, sorts the pairs by the element using bubble_sort_by_key, and
+-- returns the indices in sorted order, converted to i32.
+entry sort_perm_i32 [n] (xs: [n]i32) =
+  zip xs (iota n)
+  |> bubble_sort_by_key (.0) (<=)
+  |> map ((.1) >-> i32.i64)
+
+-- ==
+-- entry: sort_perm_f32
+-- input { [5f32,4f32,3f32,2f32,1f32,0f32,-1f32,-2f32] }
+-- output { [7, 6, 5, 4, 3, 2, 1, 0] }
+
+-- Entry point for the test case above: pairs every element with its
+-- index, sorts the pairs by the element using bubble_sort_by_key, and
+-- returns the indices in sorted order, converted to i32.
+entry sort_perm_f32 [n] (xs: [n]f32) =
+  zip xs (iota n)
+  |> bubble_sort_by_key (.0) (<=)
+  |> map ((.1) >-> i32.i64)