Publications.bib

@Comment{{
We try to keep the formatting in this bibtex file consistent. Please
try to follow the following style guide.

- Order: The entries of this file are ordered by year of appearance and
          then by the bibtex tags (newest entries at the top).
- Keys:  Use the style firstauthor.lastname + year + optional-tag.
         E.g. [Feautrier1992multi]
- '{}': Use a single pair of braces and embrace individual words/letters
         that should always remain uppercase.
- Abbreviations: Do not abbreviate conferences and journal names.
- Abstracts: Include abstracts, if available.
- ACM style: For all remaining style issues, we to follow the style used by ACM
         (see e.g., Baskaran2009)

!! The style rules are necessarily incomplete, if you would like to improve
   the style of this file, feel free to provide a patch that both extends
   the style guide and fixes the existing entries.
}}

@inproceedings{Alias2021dpn,
  title={Data-aware process networks},
  author={Alias, Christophe and Plesco, Alexandru},
  booktitle={Proceedings of the 30th ACM SIGPLAN International Conference on Compiler Construction},
  pages={1--11},
  year={2021},
  keywords = {High-Level Synthesis, FPGA, Automatic Parallelization, Polyhedral Model}
}

@inproceedings{Baghdadi:2019:TPC:3314872.3314896,
 author = {Baghdadi, Riyadh and Ray, Jessica and Romdhane, Malek Ben and Del Sozzo, Emanuele and Akkas, Abdurrahman and Zhang, Yunming and Suriana, Patricia and Kamil, Shoaib and Amarasinghe, Saman},
 title = {Tiramisu: A Polyhedral Compiler for Expressing Fast and Portable Code},
 booktitle = {Proceedings of the 2019 IEEE/ACM International Symposium on Code Generation and Optimization},
 series = {CGO 2019},
 year = {2019},
 isbn = {978-1-7281-1436-1},
 location = {Washington, DC, USA},
 pages = {193--205},
 numpages = {13},
 url = {https://arxiv.org/pdf/1804.10694.pdf},
 acmid = {3314896},
 publisher = {IEEE Press},
 address = {Piscataway, NJ, USA},
 keywords = {Code Generation, Code Optimization, Deep Learning, Distributed Systems, GPU, Polyhedral Model, Tensors}
} 

@INPROCEEDINGS{7429301,
author={R. {Baghdadi} and U. {Beaugnon} and A. {Cohen} and T. {Grosser} and M. {Kruse} and C. {Reddy} and S. {Verdoolaege} and A. {Betts} and A. F. {Donaldson} and J. {Ketema} and J. {Absar} and S. v. {Haastregt} and A. {Kravets} and A. {Lokhmotov} and R. {David} and E. {Hajiyev}},
booktitle={2015 International Conference on Parallel Architecture and Compilation (PACT)},
title={PENCIL: A Platform-Neutral Compute Intermediate Language for Accelerator Programming},
year={2015},
volume={},
number={},
pages={138-149},
keywords={application program interfaces;graphics processing units;parallel architectures;parallel programming;program compilers;specification languages;platform-neutral compute intermediate language;accelerator programming;GPUs;low-level APIs;CUDA;automatic parallelization;domain specific languages;performance portability;GNU C99;portable implementation language;DSL compilers;PENCIL-to-OpenCL backend;polyhedral compiler;data-dependent control flow;nonaffine array accesses;image processing kernels;Rodinia suites;SHOC suites;DSL embedding scenarios;linear algebra;signal processing radar applications;SpearDE;AMD Radeon HD 5670 GPU platform;R9 285 GPU platform;NVIDIA GTX 470 GPU platform;ARM Mali-T604 GPU platform;DSL;Optimization;Kernel;Image processing;Graphics processing units;Benchmark testing;Arrays;automatic optimization;intermediate language;polyhedral model;domain specific languages;OpenCL},
doi={10.1109/PACT.2015.17},
ISSN={1089-795X},
month={Oct},
 url = {https://ieeexplore.ieee.org/document/7429301}
 }

@article{Baghdadi:2013:ILT:2400682.2400711,
 author = {Baghdadi, Riyadh and Cohen, Albert and Verdoolaege, Sven and Trifunovi\'{c}, Konrad},
 title = {Improved Loop Tiling Based on the Removal of Spurious False Dependences},
 journal = {ACM Trans. Archit. Code Optim.},
 issue_date = {January 2013},
 volume = {9},
 number = {4},
 month = jan,
 year = {2013},
 issn = {1544-3566},
 pages = {52:1--52:26},
 articleno = {52},
 numpages = {26},
 url = {http://doi.acm.org/10.1145/2400682.2400711},
 doi = {10.1145/2400682.2400711},
 acmid = {2400711},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {Tiling, compiler, expansion, false dependences, memory-based dependences}
}

@techreport{baghdadi:hal-01154812,
  TITLE = {{PENCIL Language Specification}},
  AUTHOR = {Baghdadi, Riyadh and Cohen, Albert and Grosser, Tobias and Verdoolaege, Sven and Lokhmotov, Anton and Absar, Javed and Van Haastregt, Sven and Kravets, Alexey and Donaldson, Alastair},
  URL = {https://hal.inria.fr/hal-01154812},
  TYPE = {Research Report},
  NUMBER = {RR-8706},
  PAGES = {37},
  INSTITUTION = {{INRIA}},
  YEAR = {2015},
  MONTH = May,
  KEYWORDS = {PENCIL ; DSL ; Accelerator ; Domain Specific Language ; Intermediate Language ; OpenCL},
  PDF = {https://hal.inria.fr/hal-01154812/file/RR-8706.pdf},
  HAL_ID = {hal-01154812},
  HAL_VERSION = {v3}
}

@article{Sukumaran-Rajam2015NonLinearLoops,
 author = {Sukumaran-Rajam, Aravind and Clauss, Philippe},
 title = {The Polyhedral Model of Nonlinear Loops},
 journal = {ACM Transactions on Architecture and Code Optimization},
 issue_date = {January 2016},
 volume = {12},
 number = {4},
 month = dec,
 year = {2015},
 issn = {1544-3566},
 pages = {48:1--48:27},
 articleno = {48},
 numpages = {27},
 url = {http://doi.acm.org/10.1145/2838734},
 doi = {10.1145/2838734},
 acmid = {2838734},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {Speculative and dynamic loop parallelization, nonlinear memory references, polyhedral model},
 abstract = {
Runtime code optimization and speculative execution are becoming increasingly
prominent to leverage performance in the current multi- and many-core era.
However, a wider and more efficient use of such techniques is mainly hampered
by the prohibitive time overhead induced by centralized data race detection,
dynamic code behavior modeling, and code generation. Most of the existing
Thread Level Speculation (TLS) systems rely on naively slicing the target loops
into chunks and trying to execute the chunks in parallel with the help of a
centralized performance-penalizing verification module that takes care of data
races. Due to the lack of a data dependence model, these speculative systems
are not capable of doing advanced transformations, and, more importantly, the
chances of rollback are high. The polyhedral model is a well-known mathematical
model to analyze and optimize loop nests. The current state-of-art tools limit
the application of the polyhedral model to static control codes. Thus, none of
these tools can generally handle codes with while loops, indirect memory
accesses, or pointers. Apollo (Automatic POLyhedral Loop Optimizer) is a
framework that goes one step beyond and applies the polyhedral model
dynamically by using TLS. Apollo can predict, at runtime, whether the codes are
behaving linearly or not, and it applies polyhedral transformations on-the-fly.
This article presents a novel system that enables Apollo to handle codes whose
memory accesses and loop bounds are not necessarily linear. More generally,
this approach expands the applicability of the polyhedral model at runtime to a
wider class of codes. Plugging together both linear and nonlinear accesses to
the dependence prediction model enables the application of polyhedral loop
optimizing transformations even for nonlinear code kernels while also allowing
a low-cost speculation verification.
}
}

@inproceedings{Mullapudi2015asplos,
 author    = {Mullapudi, Ravi Teja and
              Vasista, Vinay and
              Bondhugula, Uday.},
 title     = {PolyMage: Automatic Optimization for Image Processing Pipelines},
 booktitle = {International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)},
 year      = {2015},
 url = {http://mcl.csa.iisc.ernet.in/polymage.html},
 abstract = {
    This paper presents the design and implementation of PolyMage, a
    domain-specific language and compiler for image processing pipelines.
    An image processing pipeline can be viewed as a graph of interconnected
    stages which process images successively. Each stage typically performs
    one of point-wise, stencil, reduction or data-dependent operations on
    image pixels. Individual stages in a pipeline typically exhibit abundant
    data parallelism that can be exploited with relative ease. However, the
    stages also require high memory bandwidth preventing effective
    utilization of parallelism available on modern architectures. For
    applications that demand high performance, the traditional options are
    to use optimized libraries like OpenCV or to optimize manually.  While
    using libraries precludes optimization across library routines, manual
    optimization accounting for both parallelism and locality is very
    tedious.

    The focus of our system, PolyMage, is on automatically generating
    high-performance implementations of image processing pipelines expressed
    in a high-level declarative language. Our optimization approach
    primarily relies on the transformation and code generation capabilities
    of the polyhedral compiler framework. To the best of our knowledge, this
    is the first model-driven compiler for image processing pipelines that
    performs complex fusion, tiling, and storage optimization automatically.
    Experimental results on a modern multicore system show that the
    performance achieved by our automatic approach is up to 1.81$\times$
    better than that achieved through manual tuning in Halide, a
    state-of-the-art language and compiler for image processing pipelines.
    For a camera raw image processing pipeline, our performance is
    comparable to that of a hand-tuned implementation.}
}

@inproceedings{Acharya2015ppopp,
 author    = {Acharya, Aravind and
              Bondhugula, Uday.},
 title     = {Pluto+: Near-Complete Modeling of Affine Transformations for Parallelism and Locality},
 booktitle = {ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP)},
 year      = {2015},
 url = {http://mcl.csa.iisc.ernet.in/downloads/publications/acharya15ppopp.pdf},
 abstract = {
    Affine transformations have proven to be very powerful for loop restructuring
    due to their ability to model a very wide range of transformations.  A
    single multi-dimensional affine function can represent a long and complex
    sequence of simpler transformations. Existing affine transformation
    frameworks like the Pluto algorithm, that include a cost function for
    modern multicore architectures where coarse-grained parallelism and
    locality are crucial, consider only a sub-space of transformations to avoid
    a combinatorial explosion in finding the transformations. The ensuing
    practical tradeoffs lead to the exclusion of certain useful
    transformations, in particular, transformation compositions involving loop
    reversals and loop skewing by negative factors. In this paper, we propose
    an approach to address this limitation by modeling a much larger space of
    affine transformations in conjunction with the Pluto algorithm's cost
    function. We perform an experimental evaluation of both, the effect on
    compilation time, and performance of generated codes.  The evaluation shows
    that our new framework, Pluto+, provides no degradation in performance in
    any of the Polybench benchmarks.  For Lattice Boltzmann Method (LBM) codes
    with periodic boundary conditions, it provides a mean speedup of 1.33$\times$ over
    Pluto.  We also show that Pluto+ does not increase compile times
    significantly.  Experimental results on Polybench show that Pluto+
    increases overall polyhedral source-to-source optimization time only by
    15%. In cases where it improves execution time significantly, it increased
    polyhedral optimization time only by 2.04$\times$.}
}

@inproceedings{Stock2014,
 author    = {Stock, Kevin and
              Kong, Martin and
              Grosser, Tobias and
              Pouchet, Louis-No{\"e}l and
              Rastello, Fabrice and
              Ramanujam, J. and
              Sadayappan, P.},
 title     = { A Framework for Enhancing Data Reuse via Associative Reordering},
 booktitle = {Conference on Programming Language Design and Implementation (PLDI)},
 year      = {2014},
}

@inproceedings{Tavarageri2014,
 author    = {Tavarageri, Sanket and
              Krishnamoorthy, Sriram and
              Sadayappan, P.},
 title     = {Compiler-Assisted Detection of Transient Memory Errors},
 booktitle = {Conference on Programming Language Design and Implementation (PLDI)},
 year      = {2014},
}

@inproceedings{Juega2014cgo,
 author = {Juega, Carlos and P\'{e}rez, Jos\'{e} Ignacio G\'{o}mez and Tenllado, Christian and Catthoor, Francky Catthoor},
 title = {Adaptive Mapping and Parameter Selection Scheme to Improve Automatic
Code Generation for GPUs},
 booktitle = {{International Symposium on Code Generation and Optimization (CGO)}},
 year = 2014,
 address = {Orlando, FL, United States},
}

@inproceedings{Grosser2014cgo,
 author = {Grosser, Tobias and Cohen, Albert and Holewinski, Justin and Sadayappan, P. and Verdoolaege, Sven},
 title = {{Hybrid Hexagonal/Classical Tiling for GPUs}},
 booktitle = {{International Symposium on Code Generation and Optimization (CGO)}},
 year = 2014,
 address = {Orlando, FL, United States},
 url = {http://hal.inria.fr/hal-00911177}
}

@inproceedings{Mehta2014ppopp,
 author = {Mehta, Sanyam and Lin, Pei-Hung and Yew, Pen-Chung},
 title = {Revisiting Loop Fusion in the Polyhedral Framework},
 booktitle = {Proceedings of the 19th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP)},
 series = {PPoPP '14},
 year = {2014},
 url = {http://www-users.cs.umn.edu/~sanyam/publications/p233-mehta.pdf},
 abstract = {Loop fusion is an important compiler optimization for improving memory hierarchy performance through enabling data reuse. Traditional compilers have approached loop fusion in a manner decoupled from other high-level loop optimizations, missing several interesting solutions. Recently, the polyhedral compiler framework with its ability to compose complex transformations, has proved to be promising in performing loop optimizations for small programs. However, our experiments with large programs using state-of-the-art polyhedral compiler frameworks reveal suboptimal fusion partitions in the transformed code. We trace the reason for this to be lack of an effective cost model to choose a good fusion partitioning among the possible choices, which increase exponentially with the number of program statements. In this paper, we propose a fusion algorithm to choose good fusion partitions with two objective functions - achieving good data reuse and preserving parallelism inherent in the source code. These objectives, although targeted by previous work in traditional compilers, pose new challenges within the polyhedral compiler framework and have thus not been addressed. In our algorithm, we propose several heuristics that work effectively within the polyhedral compiler framework and allow us to achieve the proposed objectives. Experimental results show that our fusion algorithm achieves performance comparable to the existing polyhedral compilers for small kernel programs, and significantly outperforms them for large benchmark programs such as those in the SPEC benchmark suite.}
}

@Article{Jimborean2014speculative,
author="Jimborean, Alexandra
and Clauss, Philippe
and Dollinger, Jean-Fran{\c{c}}ois
and Loechner, Vincent
and Martinez Caama{\~{n}}o, Juan Manuel",
title="Dynamic and Speculative Polyhedral Parallelization Using Compiler-Generated Skeletons",
journal="International Journal of Parallel Programming",
year="2014",
volume="42",
number="4",
pages="529--545",
abstract="We propose a framework based on an original generation and use of algorithmic skeletons, and dedicated to speculative parallelization of scientific nested loop kernels, able to apply at run-time polyhedral transformations to the target code in order to exhibit parallelism and data locality. Parallel code generation is achieved almost at no cost by using binary algorithmic skeletons that are generated at compile-time, and that embed the original code and operations devoted to instantiate a polyhedral parallelizing transformation and to verify the speculations on dependences. The skeletons are patched at run-time to generate the executable code. The run-time process includes a transformation selection guided by online profiling phases on short samples, using an instrumented version of the code. During this phase, the accessed memory addresses are used to compute on-the-fly dependence distance vectors, and are also interpolated to build a predictor of the forthcoming accesses. Interpolating functions and distance vectors are then employed for dependence analysis to select a parallelizing transformation that, if the prediction is correct, does not induce any rollback during execution. In order to ensure that the rollback time overhead stays low, the code is executed in successive slices of the outermost original loop of the nest. Each slice can be either a parallel version which instantiates a skeleton, a sequential original version, or an instrumented version. Moreover, such slicing of the execution provides the opportunity of transforming differently the code to adapt to the observed execution phases, by patching differently one of the pre-built skeletons. The framework has been implemented with extensions of the LLVM compiler and an x86-64 runtime system. Significant speed-ups are shown on a set of benchmarks that could not have been handled efficiently by a compiler.",
issn="1573-7640",
doi="10.1007/s10766-013-0259-4",
url="http://dx.doi.org/10.1007/s10766-013-0259-4"
}

@inproceedings{Jimborean2014cgo,
 author = {Jimborean, Alexandra and Koukos, Konstantinos and Spiliopoulos, Vasileios and
 Black-Schaffer, David and Kaxiras, Stefanos},
 title = {{Fix the code. Don't tweak the hardware: A new compiler approach to Voltage-Frequency scaling}},
 booktitle = {{International Symposium on Code Generation and Optimization (CGO)}},
 year = 2014,
 address = {Orlando, FL, United States},
}

@inproceedings{Venkat2014cgo,
 author = {Venkat, Anand and Shantharam, Manu and Hall, Mary and Strout, Michelle},
 title = {{Non-affine Extensions to Polyhedral Code Generation}},
 booktitle = {{International Symposium on Code Generation and Optimization (CGO)}},
 year = 2014,
 address = {Orlando, FL, United States},
}

@InProceedings{Grosser2014Relation,
 author = 	 {Tobias Grosser and Sven Verdoolaege and Albert Cohen and P. Sadayappan},
 title =	{The relation between diamond tiling and hexagonal tiling},
 booktitle = {1st International Workshop on High-Performance Stencil Computations (HiStencils 2014)},
 address = {Vienna, Austria},
 month = 	 jan,
 year = 	 {2014},
 url = {http://www.exastencils.org/histencils/histencils2014.pdf#page=75},
 abstract = {
Iterative stencil computations are important in scientific computing and more
and more also in the embedded and mobile domain. Recent publications have shown
that tiling schemes that ensure concurrent start provide efficient ways to
execute these kernels. Diamond tiling and hybrid-hexagonal tiling are two
successful tiling schemes that enable concurrent start. Both have different
advantages: diamond tiling is integrated in a general purpose optimization
framework and uses a cost function to choose among tiling hyperplanes, whereas
the more flexible tile sizes of hybrid-hexagonal tiling have proven to be
effective for the generation of GPU code.

We show that these two approaches are even more interesting when combined. We
revisit the formalization of diamond and hexagonal tiling, present the effects
of tile size and wavefront choices on tile-level parallelism, and formulate
constraints for optimal diamond tile shapes. We then extend the diamond tiling
formulation into a hexagonal tiling one, combining the benefits of both. The
paper closes with an outlook of hexagonal tiling in higher dimensional spaces,
an important generalization suitable for massively parallel architectures.
}
}

@inproceedings{Darte2014impact,
 author = {Darte, Alain and Isoard, Alexandre},
 title = {Parametric Tiling with Inter-Tile Data Reuse},
 booktitle = {Proceedings of the
     4th International Workshop on Polyhedral Compilation Techniques},
 editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
 year   = 2014,
 month  = Jan,
 address = {Vienna, Austria},
 url = {http://impact.gforge.inria.fr/impact2014/papers/impact2014-darte.pdf},
 abstract = {
  Loop tiling is a loop transformation widely used to improve spatial and
  temporal data locality, increase computation granularity, and enable blocking
  algorithms, which are particularly useful when offloading kernels on
  platforms with small memories. When hardware caches are not available, data
  transfers must be software-managed: they can be reduced by exploiting data
  reuse between tiles and, this way, avoid some useless external communications.
  An important parameter of loop tiling is the sizes of the tiles, which impact
  the size of the necessary local memory. However, for most analyzes that
  involve several tiles, which is the case for intertile data reuse, the tile
  sizes induce non-linear constraints, unless they are numerical constants.
  This complicates or prevents a parametric analysis. In this paper, we show
  that, actually, parametric tiling with inter-tile data reuse is nevertheless
  possible, i.e., it is possible to determine, at compile-time and in a
  parametric fashion, the copy-in and copy-out data sets for all tiles, with
  inter-tile reuse, as well as the sizes of the induced local memories, without
  the need to analyze the code for each tile size.
 }
}
@inproceedings{Guo2014impact,
 author = {Guo, Jing and Bernecky, Robert and
 		Thiyagalingam, Jeyarajan and Scholz, Sven-Bodo},
 title = {Polyhedral Methods for Improving Parallel Update-in-Place},
 booktitle = {Proceedings of the
     4th International Workshop on Polyhedral Compilation Techniques},
 editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
 year   = 2014,
 month  = Jan,
 address = {Vienna, Austria},
 url = {http://impact.gforge.inria.fr/impact2014/papers/impact2014-guo.pdf},
 abstract = {
We demonstrate an optimization, denoted as polyhedral reuse analysis (PRA),
that uses polyhedral methods to improve the analysis of in-place update for
single-assignment arrays.  The PRA optimization attempts to determine when
parallel array operations that jointly deffine new arrays from existing ones can
reuse the memory of the existing arrays, rather than creating new ones.
Polyhedral representations and related dependency inference methods facilitate
that analysis.

In the context of SaC , we demonstrate the impact of this
optimisation using two non-trivial benchmarks evaluated on conventional shared
memory machines and on GPUs, obtaining performance improvements of 2-8 times
for LU Decomposition and of 2-10 times for Needleman-Wunsch, over the same
computations with PRA disabled.
 }
}
@inproceedings{Iooss2014impact,
 author = {Iooss, Guillaume and Rajopadhye, Sanjay and
 	Alias, Christophe and Zou, Yun},
 title = {CART: Constant Aspect Ratio Tiling},
 booktitle = {Proceedings of the
     4th International Workshop on Polyhedral Compilation Techniques},
 editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
 year   = 2014,
 month  = Jan,
 address = {Vienna, Austria},
 url = {http://impact.gforge.inria.fr/impact2014/papers/impact2014-iooss.pdf},
 abstract = {
Parametric tiling is a well-known transformation which is widely used to improve
locality, parallelism and granularity.  However, parametric tiling is also a
non-linear transformation and this prevents polyhedral analysis or further
polyhedral transformation after parametric tiling. It is therefore generally
applied during the code generation phase.

In this paper, we present a method
to remain polyhedral, in a special case of parametric tiling, where all the
dimensions are tiled and all the tile sizes are constant multiples of a single
tile size parameter. We call this Constant Aspect Ratio Tiling . We show how to
mathematically transform a polyhedron and an affine function into their tiled
counterpart, which are the two main operations needed in such transformation.
}
}
@inproceedings{Li2014impact,
 author = {Li, Peng and Pouchet, Louis-No{\"e}l and Cong, Jason},
 title = {Throughput Optimization for High-Level Synthesis
 		Using Resource Constraints},
 booktitle = {Proceedings of the
     4th International Workshop on Polyhedral Compilation Techniques},
 editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
 year   = 2014,
 month  = Jan,
 address = {Vienna, Austria}
}
@inproceedings{Mullapudi2014impact,
 author = {Mullapudi, Ravi Teja and Bondhugula, Uday},
 title = {Tiling for Dynamic Scheduling},
 booktitle = {Proceedings of the
     4th International Workshop on Polyhedral Compilation Techniques},
 editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
 year   = 2014,
 month  = Jan,
 address = {Vienna, Austria},
 url = {http://impact.gforge.inria.fr/impact2014/papers/impact2014-mullapudi.pdf},
 abstract = {
Tiling is a key transformation used for coarsening the granularity of
parallelism and improving locality. It is known that current state-of-the-art
compiler approaches for tiling affine loop nests make use of sufficient, i.e.,
conservative conditions for the validity of tiling.  These conservative
conditions, which are used for static scheduling, miss tiling schemes for which
the tile schedule is not easy to describe statically. However, the partial
order of the tiles can be expressed using dependence relations which can be
used for dynamic scheduling at runtime. Another set of opportunities are missed
due to the classic reason that finding valid tiling hyperplanes is often harder
than checking whether a given tiling is valid.

Though the conservative
conditions for validity of tiling have worked in practice on a large number of
codes, we show that they fail to find the desired tiling in several cases –
some of these have dependence patterns similar to real world problems and
applications. We then look at ways to improve current techniques to address
this issue. To quantify the potential of the improved techniques, we manually
tile two dynamic programming algorithms – the Floyd-Warshall algorithm, and
Zuker’s RNA secondary structure prediction and report their performance on a
shared memory multicore. Our 3-d tiled dynamically scheduled implementation of
Zuker’s algorithm outperforms an optimized multi-core implementation GTfold by
a factor of 2.38. Such a 3-d tiling was possible only by reasoning with more
precise validity conditions
}
}
@inproceedings{Simbuerger2014impact,
 author = {Simb{\"u}rger, Andreas and Gr{\"o}{\ss}liger, Armin},
 title = {On the Variety of Static Control Parts in Real-World Programs:
     from Affine via Multi-dimensional to Polynomial and Just-in-Time},
 booktitle = {Proceedings of the
     4th International Workshop on Polyhedral Compilation Techniques},
 editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
 year   = 2014,
 month  = Jan,
 address = {Vienna, Austria},
 url = {http://impact.gforge.inria.fr/impact2014/papers/impact2014-simbuerger.pdf},
 abstract = {
The polyhedron model has been used successfully for automatic parallelization
of code regions with loop nests satisfying certain restrictions, so-called
static control parts. A popular implementation of this model is Polly (an
extension of the LLVM compiler), which is able to identify static control parts
in the intermediate representation of the compiler. We look at static control
parts found in 50 real-world programs from different domains. We study whether
these programs are amenable to polyhedral optimization by Polly at compile time
or at run time. We report the number of static control parts with uniform or
authorne dependences found and study extensions of the current implementation
in Polly . We consider extensions which handle multi-dimensional arrays with
parametric sizes and arrays represented by "pointer-to-pointer" constructs. In
addition, we extend the modeling capabilities of Polly to a model using
semi-algebraic sets and real algebra instead of polyhedra and linear algebra.
We do not only consider the number and size of the code regions found but
measure the share of the run time the studied programs spend in the identified
regions for each of the classes of static control parts under study.
}
}
@inproceedings{Verdoolaege2014impact,
 author = {Verdoolaege, Sven and Guelton, Serge and
 	Grosser, Tobias and Cohen, Albert},
 title = {Schedule Trees},
 booktitle = {Proceedings of the
     4th International Workshop on Polyhedral Compilation Techniques},
 editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
 year   = 2014,
 month  = Jan,
 address = {Vienna, Austria},
 url = {http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf},
 abstract = {
 Schedules in the polyhedral model, both those that represent the original
execution order and those produced by scheduling algorithms, naturally have the
form of a tree. Generic schedule representations proposed in the literature
encode this tree structure such that it is only implicitly available.
Following the internal representation of isl , we propose to represent
schedules as explicit trees and further extend the concept by introducing
different kinds of nodes. We compare our schedule trees to other
representations in detail and illustrate how they have been successfully used
to simplify the implementation of a non-trivial polyhedral compiler.
 }
}
@inproceedings{Wang2014impact,
 author = {Wang, Wei and Cavazos, John and Porterfield, Allan},
 title = {Energy Auto-tuning using the Polyhedral Approach},
 booktitle = {Proceedings of the
     4th International Workshop on Polyhedral Compilation Techniques},
 editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
 year   = 2014,
 month  = Jan,
 address = {Vienna, Austria},
 url = {http://impact.gforge.inria.fr/impact2014/papers/impact2014-wang.pdf},
 abstract = {
As the HPC community moves into the exascale computing era, application energy
has become a big concern. Tuning for energy will be essential in the effort to
overcome the limited power envelope. How is tuning for lower energy related to
tuning for faster execution? Understanding that relationship can guide both
performance and energy tuning for exascale.  In this paper, a strong
correlation is presented between the two that allows tuning for execution to be
used as a proxy for energy tuning. We also show that polyhedral compilers can
ectively tune a realistic application for both time and energy.

For a large
number of variants of the Polybench programs and LULESH energy consumption is
strongly correlated with total execution time. Optimizations can increase the
power and energy required between variants, but the variant with minimum
execution time also has the lowest energy usage. The polyhedral framework was
also used to optimize a 2D cardiac wave propagation simulation application.
Various loop optimizations including fusion, tiling, vectorization, and
auto-parallelization, achieved a 20% speedup over the baseline OpenMP
implementation, with an equivalent reduction in energy on an Intel Sandy Bridge
system.  On an Intel Xeon Phi system, improvements as high as 21% in execution
time and 19% reduction in energy are obtained

}
}
@inproceedings{Yuki2014impact,
 author = {Yuki, Tomofumi},
 title = {Understanding {PolyBench/C} 3.2 Kernels},
 booktitle = {Proceedings of the
     4th International Workshop on Polyhedral Compilation Techniques},
 editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
 year   = 2014,
 month  = Jan,
 address = {Vienna, Austria},
 url = {http://impact.gforge.inria.fr/impact2014/papers/impact2014-yuki.pdf},
 abstract = {
 In this position paper, we argue the need for more rigorous specification of
kernels in the PolyBench/C benchmark suite.  Currently, the benchmarks are
mostly specified by their implementation as C code, with a one sentence
description of what the code is supposed to do. While this is suchcient in the
context of automated loop transformation, the lack of precise specification may
have let some questionable behaviors as benchmark kernels remain unnoticed.

As
an extreme example, two kernels in PolyBench/C 3.2 exhibit parametric speed up
with respect to the problem size when its questionable properties are used.
Abusing such properties can provide arbitrary speedup, which can be some factor
of millions, potentially threatening the credibility of any experimental
evaluation using PolyBench.
 }
}

@article{ShirakOil,
  title={Oil and Water can mix! Experiences with integrating Polyhedral and AST-based Transformations},
  author={Shirako, Jun and Sarkar, Vivek},
  booktitle = {17th Workshop Compilers for Parallel Computing (CPC)},
  year = 2013
}

@inproceedings{Konstantinidis2013parametric,
  title={Parametric GPU code generation for affine loop programs},
  author={Konstantinidis, Athanasios and Kelly, Paul HJ and Ramanujam, J and Sadayappan, P},
  booktitle={International Workshop on Languages and Compilers for Parallel Computing},
  pages={136--151},
  year={2013},
  organization={Springer},
  url={https://parasol.tamu.edu/lcpc2013/papers/lcpc2013_submission_21.pdf},
  abstract={
  Partitioning a parallel computation into finitely sized chunks for effective
  mapping onto a parallel machine is a critical concern for source-to-source
  compilation. In the context of OpenCL and CUDA, this translates to the definition
  of a uniform hyper-rectangular partitioning of the parallel execution space
  where each partition is subject to a fine-grained distribution of resources that has
  a direct yet hard to estimate impact on performance. This paper develops the first
  compilation scheme for generating parametrically tiled codes for affine loop programs
  on GPUs which facilitates run-time exploration of partitioning parameters
  as a fast and portable way of finding the ones that yield maximum performance.
  Our approach is based on a parametric tiling scheme for producing wavefronts
  of parallel rectangular partitions of parametric size and a novel runtime system
  that manages wavefront execution and local memory usage dynamically through
  an inspector-executor mechanism. Our experimental evaluation demonstrates the
  effectiveness of our approach for wavefront as well as rectangularly-parallel partitionings.
  }
}

@inproceedings{Kong2013polyhedral,
 title={When polyhedral transformations meet SIMD code generation},
 author={Kong, Martin and Veras, Richard and Stock, Kevin and Franchetti, Franz and Pouchet, Louis-No{\"e}l and Sadayappan, P},
 booktitle={Proceedings of the 34th ACM SIGPLAN conference on Programming language design and implementation},
 pages={127--138},
 year={2013},
 organization={ACM},
 url={http://users.ece.cmu.edu/~franzf/papers/pldi13.pdf},
 abstract={
Data locality and parallelism are critical optimization objectives for
performance on modern multi-core machines. Both coarse-grain parallelism (e.g.,
multi-core) and fine-grain parallelism (e.g., vector SIMD) must be effectively
exploited, but despite decades of progress at both ends, current compiler
optimization schemes that attempt to address data locality and both kinds of
parallelism often fail at one of the three objectives.

We address this problem by proposing a 3-step framework, which aims for
integrated data locality, multi-core parallelism and SIMD execution of
programs. We define the concept of vectorizable codelets, with properties
tailored to achieve effective SIMD code generation for the codelets. We
leverage the power of a modern high-level transformation framework to
restructure a program to expose good ISA-independent vectorizable codelets,
exploiting multi-dimensional data reuse. Then, we generate ISA-specific
customized code for the codelets, using a collection of lower-level
SIMD-focused optimizations.

We demonstrate our approach on a collection of numerical kernels that we
automatically tile, parallelize and vectorize, exhibiting significant
performance improvements over existing compilers.
}
}

@inproceedings{Grosser2013split,
 title={Split tiling for {GPU}s: automatic parallelization using trapezoidal tiles},
 author={Grosser, Tobias and Cohen, Albert and Kelly, Paul HJ and Ramanujam, J and Sadayappan, P and Verdoolaege, Sven},
 booktitle={GPGPU-6},
 pages={24--31},
 year={2013},
 organization={ACM},
 pdf = {http://hal.inria.fr/hal-00786812/PDF/paper.pdf},
 abstract = {
Tiling is a key technique to enhance data reuse. For computations structured as
one sequential outer "time" loop enclosing a set of parallel inner loops,
tiling only the parallel inner loops may not enable enough data reuse in the
cache. Tiling the inner loops along with the outer time loop enhances data
locality but may require other transformations like loop skewing that inhibit
inter-tile parallelism.

One approach to tiling that enhances data locality without inhibiting
inter-tile parallelism is split tiling, where tiles are subdivided into a
sequence of trapezoidal computation steps. In this paper, we develop an
approach to generate split tiled code for GPUs in the PPCG polyhedral code
generator. We propose a generic algorithm to calculate index-set splitting that
enables us to perform tiling for locality and synchronization avoidance, while
simultaneously maintaining parallelism, without the need for skewing or
redundant computations. Our algorithm performs split tiling for an arbitrary
number of dimensions and without the need to construct any large integer linear
program. The method and its implementation are evaluated on standard stencil
kernels and compared with a state-of-the-art polyhedral compiler and with a
domain-specific stencil compiler, both targeting CUDA GPUs.
}
}

@article{Mehta2013taco,
 author = {Mehta, Sanyam and Beeraka, Gautham and Yew, Pen-Chung},
 title = {Tile Size Selection Revisited},
 journal = {ACM Transactions on Architecture and Code Optimization (TACO)},
 issue_date = {December 2013},
 volume = {10},
 number = {4},
 month = dec,
 year = {2013},
 url = {http://www-users.cs.umn.edu/~sanyam/publications/a35-mehta.pdf},
 abstract = {Loop tiling is a widely used loop transformation to enhance data locality and allow data reuse. In the tiled code, however, tiles of different sizes can lead to significant variation in performance. Thus, selection of an optimal tile size is critical to performance of tiled codes.
In the past, tile size selection has been attempted using both static analytical and dynamic empirical (auto-tuning) models. Past work using static models assumed a direct-mapped cache for the purpose of analysis and thus proved to be less robust. On the other hand, the auto-tuning models involve an exhaustive search in a large space of tiled codes. In this article, we propose a new analytical model for tile size selection that leverages the high set associativity in modern caches to minimize conflict misses. Our tile size selection model targets data reuse in multiple levels of cache. In addition, it considers the interaction of tiling with the SIMD unit in modern processors in estimating the optimal tile size. We find that these factors, not considered in previous models, are critical in developing a robust model for tile size selection. We implement our tile size selection model in a polyhedral compiler and test it on 12 benchmark kernels using two different problem sizes. Our model outperforms the previous analytical models that are based on reusing data in a single level of cache and achieves an average performance improvement of 9.7\% and 20.4\%, respectively, over the best square (cubic) tiles for the two problem sizes. In addition, the tile size chosen by our tile size selection algorithm is similar to the best performing size obtained through an extensive search, validating the analytical model underlying the algorithm.}
}

@ARTICLE{Gonzalez2013tpds,
  author =  "A. Gonzalez-Escribano and Y. Torres and J. Fresno and D. Llanos",
  title =  "An extensible system for multilevel automatic data partition and mapping",
  journal =  "IEEE Transactions on Parallel and Distributed Systems ",
  year =  "2013",
  month =  "March",
  doi="10.1109/TPDS.2013.83",
  abstract = "
Automatic data distribution is a key feature to obtain efficient
implementations from abstract and portable parallel codes. We present a highly
efficient and extensible runtime library that integrates techniques for
automatic data partition and mapping. It uses a novel approach to define an
abstract interface and a plug-in system to encapsulate different types of
regular and irregular techniques, helping to generate codes which are
independent of the exact mapping functions selected. Currently, it supports
hierarchical tiling of arrays with dense and stride domains, that allows the
implementation of both data and task parallelism using a SPMD model. It
automatically computes appropriate domain partitions for a selected virtual
topology, mapping them to available processors with static or dynamic
load-balancing techniques. Our library also allows the construction of reusable
communication patterns that efficiently exploit MPI communication capabilities.
The use of our library greatly reduces the complexity of data distribution and
communication, hiding the details of the underlying architecture. The library
can be used as an abstract layer for building generic tiling operations as
well. Our experimental results show that the use of this library allows to
achieve similar performance as carefully-implemented manual versions for
several, well-known parallel kernels and benchmarks in distributed and
multicore systems, and substantially reduces programming effort.

",
  url = "http://www.computer.org/csdl/trans/td/preprint/06482561-abs.html"
}

@ARTICLE{Fresno2013js,
  author =  "J. Fresno and A. Gonzalez-Escribano and D. Llanos",
  title =  "Extending a Hierarchical Tiling Arrays Library to Support Sparse Data Partitioning",
  journal =  "The Journal of Supercomputing",
  year =  "2013",
  volume =  "64",
  number =  "1",
  pages =  "59--68",
  month =  "April",
  doi = "10.1007/s11227-012-0757-y",
  abstract = "
Layout methods for dense and sparse data are often seen as two separate
problems with their own particular techniques. However, they are based on the
same basic concepts. This paper studies how to integrate automatic data-layout
and partition techniques for both dense and sparse data structures. In
particular, we show how to include support for sparse matrices or graphs in
Hitmap, a library for hierarchical tiling and automatic mapping of arrays. The
paper shows that it is possible to offer a unique interface to work with both
dense and sparse data structures. Thus, the programmer can use a single and
homogeneous programming style, reducing the development effort and simplifying
the use of sparse data structures in parallel computations. Our experimental
evaluation shows that this integration of techniques can be effectively done
without compromising performance.
",
  url = "http://link.springer.com/article/10.1007%2Fs11227-012-0757-y"
}

@INPROCEEDINGS{Torres2013pdpta,
  author =  "Y. Torres and A. Gonzalez-Escribano and D. Llanos",
  title =  "Automatic Run-time Mapping of Polyhedral Computations to Heterogeneous Devices with Memory-size Restrictions",
  booktitle =  "PDPTA'13 - The 2013 International Conference on Parallel and Distributed Processing Techniques and Applications",
  year =  "2013",
  volume =  "2",
  month =  "July",
  publisher =  "CSREA Press",
  isbn = "1-60132-256-9, 1-60132-257-7 (1-60132-258-5)",
  abstract = "
Tools that aim to automatically map parallel computations to heterogeneous and
hierarchical systems try to divide the whole computation in parts with
computational loads adjusted to the capabilities of the target devices. Some
parts are executed in node cores, while others are executed in accelerator
devices.  Each part requires one or more data-structure pieces that should be
allocated in the device memory during the computation.

In this paper we present a model that allows such automatic mapping tools to
transparently assign computations to heterogeneous devices with different
memory size restrictions. The model requires the programmer to specify the
access patterns of the computation threads in a simple abstract form. This
information is used at run-time to determine the second-level partition of the
computation assigned to a device, ensuring that the data pieces required by
each sub-part fit in the target device memory, and that the number of kernels
launched is minimal.  We present experimental results with a prototype
implementation of the model that works for regular polyhedral expressions. We
show how it works for different example applications and access patterns,
transparently executing big computations in devices with different memory size
restrictions.
",
  url = {http://www.infor.uva.es/~diego/docs/torres13.pdf}
}

@techreport{Grosser2013Promises,
 hal_id = {hal-00848691},
 url = {http://hal.inria.fr/hal-00848691},
 title = {{The Promises of Hybrid Hexagonal/Classical Tiling for GPU}},
 author = {Grosser, Tobias and Verdoolaege, Sven and Cohen, Albert and Sadayappan, P.},
 abstract = {
  Time-tiling is necessary for efficient execution of iterative
  stencil computations. But the usual hyper-rectangular tiles cannot
  be used because of positive/negative dependence distances along the
  stencil's spatial dimensions. Several prior efforts have addressed
  this issue. However, known techniques trade enhanced data reuse
  for other causes of inefficiency, such as unbalanced parallelism,
  redundant computations, or increased control flow overhead incompatible
  with efficient GPU execution. We explore a new path to maximize the
  effectivness of time-tiling on iterative stencil computations. Our
  approach is particularly well suited for GPUs. It does not require
  any redundant computations, it favors coalesced global-memory access
  and data reuse in shared-memory/cache, avoids thread divergence, and
  extracts a high degree of parallelism. We introduce hybrid hexagonal
  tiling, combining hexagonal tile shapes along the time (sequential)
  dimension and one spatial dimension, with classical tiling for other
  spatial dimensions. An hexagonal tile shape simultaneously enable
  parallel tile execution and reuse along the time dimension. Experimental
  results demonstrate significant performance improvements over existing
  stencil compilers.
 },
 affiliation = {PARKAS - INRIA Paris-Rocquencourt, Department of Computer
 Science and Engineering - CSE},
 type = {Rapport de recherche},
 institution = {INRIA},
 number = {RR-8339},
 year = {2013},
 month = Jul,
 pdf = {http://hal.inria.fr/hal-00848691/PDF/RR-8339.pdf},
}

@article{Verdoolaege2013PPCG,
 title = {Polyhedral parallel code generation for {CUDA}},
 author = {Verdoolaege, Sven and Juega, Juan Carlos and Cohen, Albert and
           G\'{o}mez, Jos{\'e} Ignacio and Tenllado, Christian and
           Catthoor, Francky},
 journal = {ACM Transactions on Architecture and Code Optimization},
 issue_date = {January 2013},
 volume = {9},
 number = {4},
 month = jan,
 year = {2013},
 issn = {1544-3566},
 pages = {54:1--54:23},
 doi = {10.1145/2400682.2400713},
 acmid = {2400713},
 publisher = {ACM},
 address = {New York, NY, USA},
}

@proceedings{impact2013,
  title  = "{P}roceedings of the 3rd {I}nternational {W}orkshop on {P}olyhedral {C}ompilation {T}echniques",
  editor = "Gr{\"o}{\ss}linger, Armin and Pouchet, Louis-No{\"e}l",
  year   = 2013,
  month  = Jan,
  address = "Berlin, Germany",
  url     = "http://nbn-resolving.de/urn:nbn:de:bvb:739-opus-26930",
  note    = "http://impact.gforge.inria.fr/impact2013/"
}

@inproceedings{feld.2013.impact,
  author = "Feld, Dustin and Soddemann, Thomas and J{\"u}nger, Michael and Mallach, Sven",
  title  = "{F}acilitate {SIMD}-{C}ode-{G}eneration in the {P}olyhedral {M}odel by {H}ardware-aware {A}utomatic {C}ode-{T}ransformation",
  pages  = "45--54",
  booktitle = "{P}roceedings of the 3rd {I}nternational {W}orkshop on {P}olyhedral {C}ompilation {T}echniques",
  editor = "Gr{\"o}{\ss}linger, Armin and Pouchet, Louis-No{\"e}l",
  year   = 2013,
  month  = Jan,
  address = "Berlin, Germany",
  url = "http://impact.gforge.inria.fr/impact2013/papers/impact2013_facilitate_simd_code_generation.pdf",
  abstract = {
Although Single Instruction Multiple Data (SIMD) units are available in general
purpose processors already since the 1990s, state-of-the-art compilers are
often still not capable to fully exploit them, i.e., they may miss to achieve
the best possible performance.

We present a new hardware-aware and adaptive
loop tiling approach that is based on polyhedral transformations and explicitly
dedicated to improve on auto-vectorization. It is an extension to the tiling
algorithm implemented within the PluTo framework [4, 5]. In its default
setting, PluTo uses static tile sizes and is already capable to enable the use
of SIMD units but not primarily targeted to optimize it. We experimented with
differnt tile sizes and found a strong relationship between their choice, cache
size parameters and performance. Based on this, we designed an adaptive
procedure that speciffically tiles vectorizable loops with dynamically
calculated sizes. The blocking is automatically fitted to the amount of data
read in loop iterations, the available SIMD units and the cache sizes. The
adaptive parts are built upon straightforward calculations that are
experimentally verified and evaluated. Our results show significant
improvements in the number of instructions vectorized, cache miss rates and,
finally, running times.
  }
}

@inproceedings{yuki.2013.impact,
  author = "Yuki, Tomofumi and Rajopadhye, Sanjay",
  title  = "{M}emory {A}llocations for {T}iled {U}niform {D}ependence {P}rograms",
  pages  = "13--22",
  booktitle = "{P}roceedings of the 3rd {I}nternational {W}orkshop on {P}olyhedral {C}ompilation {T}echniques",
  editor = "Gr{\"o}{\ss}linger, Armin and Pouchet, Louis-No{\"e}l",
  year   = 2013,
  month  = Jan,
  address = "Berlin, Germany",
  url = "http://impact.gforge.inria.fr/impact2013/papers/impact2013_memory_allocations_for_tiled_uniform_dependence_programs.pdf",
  abstract = {
In this paper, we develop a series of extensions to schedule-independent
storage mapping using Quasi-Universal Occupancy Vectors (QUOVs) targeting tiled
execution of polyhedral programs. By quasi-universality, we mean that we
restrict the \universe" of the schedule to those that correspond to tiling.
This provides the following benefits: (i) the shortest QUOVs may be shorter
than the fully universal ones, (ii) the shortest QUOVs can be found without any
search, and (iii) multi-statement programs can be handled.  The resulting
storage mapping is valid for tiled execution by any tile size.
}
}

@inproceedings{fassi.2013.impact,
  author = "Fassi, Im{\`e}n and Clauss, Philippe and Kuhn, Matthieu and Slama, Yosr",
  title  = "{M}ultifor for {M}ulticore",
  pages  = "37--44",
  booktitle = "{P}roceedings of the 3rd {I}nternational {W}orkshop on {P}olyhedral {C}ompilation {T}echniques",
  editor = "Gr{\"o}{\ss}linger, Armin and Pouchet, Louis-No{\"e}l",
  year   = 2013,
  month  = Jan,
  address = "Berlin, Germany",
  url = "http://impact.gforge.inria.fr/impact2013/papers/impact2013_multifor_for_multicore.pdf",
  abstract = {
 We propose a new programming control structure called "multifor", allowing to
take advantage of parallelization models that were not naturally attainable
with the polytope model before. In a multifor-loop, several loops whose bodies
are run simultaneously can be defined. Respective iteration domains are mapped
onto each other according to a run frequency - the grain - and a relative
position - the offset -. Execution models like dataflow, stencil computations or
MapReduce can be represented onto one referential iteration domain, while still
exhibiting traditional polyhedral code analysis and transformation
opportunities. Moreover, this construct provides ways to naturally exploit
hybrid parallelization models, thus significantly improving parallelization
opportunities in the multicore era. Traditional polyhedral software tools are
used to generate the corresponding code. Additionally, a promising perspective
related to nonlinear mapping of iteration spaces is also presented, yielding to
run a loop nest inside any other one by solving the problem of inverting
"ranking Ehrhart polynomials".
  }
}

@inproceedings{verdoolaege.2013.impact,
  author = "Verdoolaege, Sven and Nikolov, Hristo and Stefanov, Todor",
  title  = "{O}n {D}emand {P}arametric {A}rray {D}ataflow {A}nalysis",
  pages  = "23--36",
  booktitle = "{P}roceedings of the 3rd {I}nternational {W}orkshop on {P}olyhedral {C}ompilation {T}echniques",
  editor = "Gr{\"o}{\ss}linger, Armin and Pouchet, Louis-No{\"e}l",
  year   = 2013,
  month  = Jan,
  address = "Berlin, Germany",
  url = "http://impact.gforge.inria.fr/impact2013/papers/impact2013_on_demand_parametric_array_dataflow_analysis.pdf",
  abstract = {
We present a novel approach for exact array data ow analysis in the presence of
constructs that are not static authorne.  The approach is similar to that of
fuzzy array data ow analysis in that it also introduces parameters that
represent information that is only available at run-time, but the parameters
have a diㄦent meaning and are analyzed before they are introduced. The
approach was motivated by our work on process networks, but should be generally
useful since fewer parameters are introduced on larger inputs. We include some
preliminary experimental results.
}
}

@inproceedings{wonnacott.2013.impact,
  author = {Wonnacott, David G. and Mills Strout, Michelle},
  title  = "{O}n the {S}calability of {L}oop {T}iling {T}echniques",
  pages  = "3--11",
  booktitle = "{P}roceedings of the 3rd {I}nternational {W}orkshop on {P}olyhedral {C}ompilation {T}echniques",
  editor = "Gr{\"o}{\ss}linger, Armin and Pouchet, Louis-No{\"e}l",
  year   = 2013,
  month  = Jan,
  address = "Berlin, Germany",
  url = "http://impact.gforge.inria.fr/impact2013/papers/impact2013_on_the_scalability_of_loop_tiling_techniques.pdf",
  abstract = {
The Polyhedral model has proven to be a valuable tool for improving memory
locality and exploiting parallelism for optimizing dense array codes. This
model is expressive enough to describe transformations of imperfectly nested
loops, and to capture a variety of program transformations, including many
approaches to loop tiling. Tools such as the highly successful PLuTo automatic
parallelizer have provided empirical confirmation of the success of
polyhedral-based optimization, through experiments in which a number of
benchmarks have been executed on machines with small- to medium-scale
parallelism.

In anticipation of ever higher degrees of parallelism, we have
explored the impact of various loop tiling strategies on the asymptotic degree
of available parallelism. In our analysis, we consider “weak scaling” as
described by Gustafson, i.e., in which the data set size grows linearly with
the number of processors available. Some, but not all, of the approaches to
tiling provide weak scaling. In particular, the tiling currently performed by
PLuTo does not scale in this sense.

In this article, we review approaches to
loop tiling in the published literature, focusing on both scalability and
implementation status. We find that fully scalable tilings are not available in
general-purpose tools, and call upon the polyhedral compilation community to
focus on questions of asymptotic scalability. Finally, we identify ongoing work
that may resolve this issue.
  }
}

@inproceedings{doerfert.2013.impact,
  author = "Doerfert, Johannes and Hammacher, Clemens and Streit, Kevin and Hack, Sebastian",
  title  = "{SP}olly: {S}peculative {O}ptimizations in the {P}olyhedral {M}odel",
  pages  = "55--60",
  booktitle = "{P}roceedings of the 3rd {I}nternational {W}orkshop on {P}olyhedral {C}ompilation {T}echniques",
  editor = "Gr{\"o}{\ss}linger, Armin and Pouchet, Louis-No{\"e}l",
  year   = 2013,
  month  = Jan,
  address = "Berlin, Germany",
  url = "http://impact.gforge.inria.fr/impact2013/papers/impact2013_spolly.pdf",
  abstract = {
The polyhedral model is only applicable to code regions that form static
control parts (SCoPs) or slight extensions thereof. To apply polyhedral
techniques to a piece of code, the compiler usually checks, by static analysis,
whether all SCoP conditions are fulfilled. However, in many codes, the compiler
fails to verify that this is the case. In this paper we investigate the
rejection causes as reported by Polly , the polyhedral optimizer of a
state-of-the-art compiler. We show that many rejections follow from the
conservative overapproximation of the employed static analyses. In SPolly , a
speculative extension of Polly, we employ the knowledge of runtime features to
supersede this overapproximation. All speculatively generated variants form
valid SCoPs and are optimizable by the facilities of Polly. Our evaluation
shows that SPolly is able to ectively widen the applicability of polyhedral
optimization. On the SPEC 2000 suite, the number of optimizable code regions is
increased by 131 percent.  In 10 out of the 31 benchmarks of the PolyBench
suite, SPolly achieves speedups of up to 11-fold as compared to plain Polly.
  }
}

@inproceedings{Zuo2013,
 author = {Zuo, Wei and Li, Peng and Chen, Deming and Pouchet, Louis-No{\"e}l and Zhong, Shunan and Cong, Jason},
 title = {Improving Polyhedral Code Generation for High-Level Synthesis},
 booktitle = "International Conference on Hardware/Software Codesign and System Synthesis",
 year = 2013,
 url = "http://vast.cs.ucla.edu/sites/default/files/publications/Improving%20Polyhedral%20Code%20Generation%20for%20High-Level%20Synthesis.pdf",
 abstract = "
High-level synthesis (HLS) tools are now capable of generating high-quality RTL
codes for a number of programs. Nevertheless, for best performance aggressive
program transformations are still required to exploit data reuse and enable
communication/computation overlap. The polyhedral compilation framework has
shown great promise in this area with the development of HLS-specific
polyhedral transformation techniques and tools. However, all these techniques
rely on polyhedral code generation to translate a schedule for the program's
operations into an actual C code that is input to the HLS tool. In this work we
study the changes to the state-of-the-art polyhedral code generator CLooG which
are required to tailor it for HLS purposes. In particular, we develop various
techniques to significantly improve resource utilization on the FPGA. We also
develop a complete technique geared towards effective code generation of
rectangularly tiled code, leading to further improvements in resource
utilization. We demonstrate our techniques on a collection of affine
benchmarks, reducing by 2x on average (up to 10x) the area used after
high-level synthesis.
"
}

@article{Khan2013cudachill,
 author    = {Malik Murtaza Khan and
              Protonu Basu and
              Gabe Rudy and
              Mary W. Hall and
              Chun Chen and
              Jacqueline Chame},
 title     = {A script-based autotuning compiler system to generate high-performance
              CUDA code},
 journal   = {ACM Transactions on Architecture and Code Optimization (TACO)},
 volume    = {9},
 number    = {4},
 year      = {2013},
 pages     = {31},
 url = "http://www.cs.utah.edu/~mhall/cs6235s13/hipeac13.pdf",
 abstract = "
This article presents a novel compiler framework for CUDA code generation. The
compiler structure is designed to support autotuning, which employs empirical
techniques to evaluate a set of alternative mappings of computation kernels and
select the mapping that obtains the best performance. This article introduces a
Transformation Strategy Generator, a meta-optimizer that generates a set of
transformation recipes, which are descriptions of the mapping of the sequential
code to parallel CUDA code. These recipes comprise a search space of possible
implementations. This system achieves performance comparable and sometimes
better than manually tuned libraries and exceeds the performance of a
state-of-the-art GPU compiler.
"
}

@inproceedings{Henretty2013stencil,
 author = {Henretty, Tom and Veras, Richard and Franchetti, Franz and Pouchet, Louis-No{\"e}l and Ramanujam, J. and Sadayappan, P.},
 title = {A stencil compiler for short-vector {SIMD} architectures},
 booktitle = {International Conference on Supercomputing (ICS)},
 publisher = {ACM},
 year = {2013},
 url = "http://www.cs.ucla.edu/~pouchet/doc/ics-article.13.pdf",
 abstract = "
Stencil computations are an integral component of applications in a
number of scientific computing domains. Short-vector SIMD instruction
sets are ubiquitous on modern processors and can be used to significantly
increase the performance of stencil computations. Traditional approaches
to optimizing stencils on these platforms have focused on either
short-vector SIMD or data locality optimizations. In this paper, we
propose a domain specific language and compiler for stencil computations
that allows specification of stencils in a concise manner and automates
both locality and short-vector SIMD optimizations, along with effective
utilization of multi-core parallelism. Loop transformations to enhance
data locality and enable load-balanced parallelism are combined with a
data layout transformation to effectively increase the performance of
stencil computations. Performance increases are demonstrated for a number
of stencils on several modern SIMD architectures.
"
}

@inproceedings{Bandishti2012,
 title={Tiling stencil computations to maximize parallelism},
 author={Bandishti, Vinayaka and Pananilath, Irshad and Bondhugula, Uday},
 booktitle={Supercomputing},
 pages={40},
 year={2012},
 organization={IEEE Computer Society Press},
 url={http://www.csa.iisc.ernet.in/~uday/publications/stencils_sc12.pdf},
 abstract = "
Most stencil computations allow tile-wise concurrent start, i.e., there always
exists a face of the iteration space and a set of tiling hyperplanes such that
all tiles along that face can be started concurrently. This provides load
balance and maximizes parallelism. However, existing automatic tiling
frameworks often choose hyperplanes that lead to pipelined start-up and load
imbalance. We address this issue with a new tiling technique that ensures
concurrent start-up as well as perfect load-balance whenever possible. We first
provide necessary and sufficient conditions on tiling hyperplanes to enable
concurrent start for programs with affine data accesses. We then provide an
approach to find such hyperplanes. Experimental evaluation on a 12-core Intel
Westmere shows that our code is able to outperform a tuned domain-specific
stencil code generator by 4% to 27%, and previous compiler techniques by a
factor of 2x to 10.14 x.
"
}

@inproceedings{Holewinski2012,
 title={High-performance code generation for stencil computations on {GPU} architectures},
 author={Holewinski, Justin and Pouchet, Louis-No{\"e}l and Sadayappan, P},
 booktitle={International Conference on Supercomputing (ICS)},
 year={2012},
 url={http://www.cse.ohio-state.edu/~pouchet/doc/ics-article.12.pdf},
 abstract= "
Stencil computations arise in many scientific computing domains, and often
represent time-critical portions of applications. There is significant interest
in offloading these computations to high-performance devices such as GPU
accelerators, but these architectures offer challenges for developers and
compilers alike. Stencil computations in particular require careful attention
to off-chip memory access and the balancing of work among compute units in GPU
devices.

In this paper, we present a code generation scheme for stencil computations on
GPU accelerators, which optimizes the code by trading an increase in the
computational workload for a decrease in the required global memory bandwidth.
We develop compiler algorithms for automatic generation of efficient,
time-tiled stencil code for GPU accelerators from a high-level description of
the stencil operation. We show that the code generation scheme can achieve high
performance on a range of GPU architectures, including both nVidia and AMD
devices.
"
}

@inproceedings{Ravishankar2012sc,
 author = {Ravishankar, Mahesh and Eisenlohr, John and Pouchet, Louis-No{\"e}l
and Ramanujam, J. and Rountev, Atanas and Sadayappan, P.},
 title = {Code Generation for Parallel Execution of a Class of Irregular Loops on Distributed Memory Systems},
 booktitle = {Supercomputing},
 year = {2012},
 location = {Salt Lake City, Utah},
 pages = {1--11},
 url = {http://www.cse.ohio-state.edu/presto/pubs/sc12.pdf},
 abstract = "
Parallelization and locality optimization of affine loop nests has been
successfully addressed for shared-memory machines. However, many large-scale
simulation applications must be executed in a distributed-memory environment,
and use irregular/sparse computations where the control-flow and array-access
patterns are data-dependent.

In this paper, we propose an approach for effective parallel execution of a
class of irregular loop computations in a distributed-memory environment, using
a combination of static and runtime analysis. We discuss algorithms that
analyze sequential code to generate an inspector and an executor. The inspector
captures the data-dependent behavior of the computation in parallel and without
requiring complete replication of any of the data structures used in the
original computation. The executor performs the computation in parallel. The
effectiveness of the framework is demonstrated on several benchmarks and a
climate modeling application.
"
}

@inproceedings{Chen2012,
 author = {Chen, Chun},
 title = {Polyhedra scanning revisited},
 booktitle = {Conference on Programming Language Design and Implementation},
 year = {2012},
 pages = {499--508},
 numpages = {10},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {polyhedra scanning, polyhedral transformations},
 url = "http://ctop.cs.utah.edu/downloads/pldi128-chen.pdf",
 abstract = "
This paper presents a new polyhedra scanning system called CodeGen+ to
address the challenge of generating high-performance code for complex iteration
spaces resulting from compiler optimization and autotuning systems. The
strength of our approach lies in two new algorithms. First, a loop overhead
removal algorithm provides precise control of trade-offs between loop overhead
and code size based on actual loop nesting depth. Second, an if-statement
simplification algorithm further reduces the number of comparisons in the code.
These algorithms combined with the expressive power of Presburger arithmetic
enable CodeGen+ to support complex optimization strategies expressed in
iteration spaces. We compare with the state-of-the-art polyhedra scanning tool
CLooG on five loop nest computations, demonstrating that CodeGen+ generates
code that is simpler and up to 1.15x faster.
"
}

@article{Grosser2012polly,
 title = {Polly -- Performing polyhedral optimizations on a low-level intermediate representation},
 author = {Grosser, Tobias and Gr{\"o}{\ss}linger, Armin and Lengauer, Christian},
 journal = {Parallel Processing Letters},
 volume = {22},
 number = {04},
 year = {2012},
 publisher = {World Scientific},
 abstract = {
The polyhedral model for loop parallelization has proved to be an effective
tool for advanced optimization and automatic parallelization of programs in
higher-level languages. Yet, to integrate such optimizations seamlessly into
production compilers, they must be performed on the compiler's internal,
low-level, intermediate representation (IR). With Polly, we present an
infrastructure for polyhedral optimizations on such an IR. We describe the
detection of program parts amenable to a polyhedral optimization (so-called
static control parts), their translation to a Z-polyhedral representation,
optimizations on this representation and the generation of optimized IR code.
Furthermore, we define an interface for connecting external optimizers and
present a novel way of using the parallelism they introduce to generate SIMD
and OpenMP code. To evaluate Polly, we compile the PolyBench 2.0 benchmarks
fully automatically with PLuTo as external optimizer and parallelizer. We can
report on significant speedups.
},
 url = {http://www.worldscientific.com/doi/abs/10.1142/S0129626412500107}
}

@InProceedings{Vasilache2012,
 author = 	 {Nicolas Vasilache and Benoit Meister and Muthu Baskaran and Richard Lethin},
 title =	{Joint Scheduling and Layout Optimization to Enable Multi-Level Vectorization},
 booktitle = {IMPACT},
 address = {Paris, France},
 month = 	 jan,
 year = 	 {2012}
}

@InProceedings{Verdoolaege2012,
 author = {Verdoolaege, Sven and Grosser, Tobias },
 title = {Polyhedral Extraction Tool},
 booktitle = { Second International Workshop on Polyhedral Compilation Techniques (IMPACT'12)},
 address = { Paris, France},
 month = jan,
 year = {2012},
 url = "http://impact.gforge.inria.fr/impact2012/workshop_IMPACT/verdoolaege.pdf",
 abstract = "
We present a new library for extracting a polyhedral model from C source. The
library is based on clang, the LLVM C frontend, and isl, a library for
manipulating quasi-affine sets and relations. The use of clang for parsing the
C code brings advanced diagnostics and full support for C99. The use of isl
allows for an easy construction and a powerful and compact representation of
the polyhedral model. Besides allowing arbitrary piecewise quasi-affine index
expressions and conditions, the library also supports some data dependent
constructs and has special treatment for unsigned integers. The library has
been successfully used to obtain polyhedral models for use in an equivalence
checker, a tool for constructing polyhedral process networks, a parallelizer
targeting GPUs and an interactive polyhedral environment.
"
}

@inproceedings{Yuki2012alphaz,
 title={{AlphaZ}: A System for Design Space Exploration in the Polyhedral Model},
 author={Yuki, Tomofumi and Gupta, Gautam and Kim, DaeGon and Pathan, Tanveer and Rajopadhye, Sanjay},
 booktitle={Proceedings of the 25th International Workshop on Languages and Compilers for Parallel Computing},
 year={2012},
 url={http://people.rennes.inria.fr/Tomofumi.Yuki/papers/yuki-lcpc2012.pdf},
 abstract={
The polyhedral model is now a well established and effective formalism for
program optimization and parallelization. However, finding optimal
transformations is a long-standing open problem. It is therefore important to
develop tools that, rather than following predefined optimization criteria,
allow practitioners to explore different choices through script-driven or
user-guided transformations. More than practitioners, such flexibility is even
more important for compiler researchers and auto-tuner developers. In addition,
tools must also raise the level of abstraction by representing and manipulating
reductions and scans explicitly. And third, the tools must also be able to
explore transformation choices that consider memory (re)-allocation.

AlphaZ is a system that allows exploration of optimizing transformations in the
polyhedral model that meets these goals. We illustrate its power through two
examples of optimizations that existing parallelization tools cannot perform,
but can be systematically applied using our system. One is time-tiling of a
code from PolyBench that resembles the Alternating Direction Implicit (ADI)
method, and the other is a transformation that brings down the algorithmic
complexity of a kernel in UNAfold, a sequence alignment software, from O(N^4)
to O(N^3).}
}

@misc{Pouchet2011polyopt,
  title={Polyopt, a polyhedral optimizer for the rose compiler},
  author={Pouchet, Louis-No{\"e}l},
  year={2011},
  publisher={July}
}

@InProceedings{Darte2011impact,
  author = 	 {Darte, Alain},
  title = 	 {Keynote: Approximations in the Polyhedral Model},
  booktitle =    {1st International Workshop on Polyhedral Compilation Techniques (IMPACT)},
  year = 	 {2011},
  editor = 	 {C. Alias and C. Bastoul},
  address = 	 {Chamonix, France},
  abstract = {
The polyhedral model is often viewed as limited, by nature, to a restricted
class of programs, the set of nested loops with static control, affine loop
counters, and affine accesses to arrays. But, actually, it is more than that.
It is really a "model", in the sense that it is indeed not the real life (of
programs) but it is nevertheless a useful representation that approximates it.
It is always beneficial to develop transformations and optimizations in the
polytope model with this perspective in mind, i.e., as being applicable to more
programs, more general situations, provided that these programs are
approximated in a suitable way. In this talk, I will recall several cases where
the polytope model can be used to handle approximated programs: the detection
of parallelism in loops with approximated dependences, the derivation of
optimized communications with local reuse, the reuse of memory with
lattice-based array folding, the analysis of while loops. Other important
approximations, which will not be covered, concern fuzzy data-flow analysis and
array region analysis, and the link with abstract interpretation.
}
}

@InProceedings{Pradelle2011impact,
  author = 	 {Pradelle, Benoît and Ketterlin, Alain and Clauss, Philippe},
  title = 	 {Transparent Parallelization of Binary Code},
  booktitle =    {1st International Workshop on Polyhedral Compilation Techniques (IMPACT)},
  year = 	 {2011},
  editor = 	 {C. Alias and C. Bastoul},
  address = 	 {Chamonix, France},
  abstract = {
This paper describes a system that applies automatic parallelization techniques
to binary code. The system works by raising x86-64 raw executable code to an
intermediate representation that exhibits all memory accesses and relevant
register definitions, but outlines detailed computations that are not relevant
for parallelization. It then uses an off-the-shelf polyhedral parallelizer,
firrst applying appropriate enabling transformations if necessary. The last
phase lowers the internal representation into a new executable fragment,
re-injecting low-level instructions into the transformed code. The system is
shown to leverage the power of polyhedral parallelization techniques in the
absence of source code, with performance approaching those of source-to-source
tools.
},
  url = {http://perso.ens-lyon.fr/christophe.alias/impact2011/impact-03.pdf},
}

@InProceedings{Ramakrishna2011impact,
  author = 	 {Upadrasta, Ramakrishna and Cohen, Albert},
  title = 	 {Potential and Challenges of Two-Variable-Per-Inequality Sub-Polyhedral Compilation},
  booktitle =    {1st International Workshop on Polyhedral Compilation Techniques (IMPACT)},
  year = 	 {2011},
  editor = 	 {C. Alias and C. Bastoul},
  address = 	 {Chamonix, France},
  abstract = {
We explore the potential of sub-polyhedra as a means to fight some of the
algorithmic complexity challenges in polyhedral compilation. The static
analysis community has produced a rich variety of sub-polyhedral abstractions,
trading precision for scalability for interprocedural analysis and symbolic
model-checking. In this work, we evaluate the potential of the
Two-Variable-Per-Inequality (TVPI) numerical abstract domain for polyhedral
compilation. We demonstrate that strongly polynomial-time algorithms can be
designed for the construction of complex affine transformations, with
applications to affine scheduling and partitioning, with various objectives
(parallelization, tiling, latency minimization). We identify some limitations,
questions and research directions to be explored in depth in what we think is a
new approach for the construction of scalable polyhedral compilation tools.
},
  url = {http://perso.ens-lyon.fr/christophe.alias/impact2011/impact-13.pdf}
}

@InProceedings{Konstantinidis2011impact,
  author = 	 {Athanasios Konstantinidis, Athanasios and Kelly, Paul H. J.},
  title = 	 {More Definite Results from the {P}luTo Scheduling Algorithm},
  booktitle =    {1st International Workshop on Polyhedral Compilation Techniques (IMPACT)},
  year = 	 {2011},
  editor = 	 {C. Alias and C. Bastoul},
  address = 	 {Chamonix, France},
  abstract = {
The PLuTo scheduling algorithm is a well-known algorithm for automatic
scheduling in the polyhedral compilation model.  It seeks linearly independent
affine partition mappings for each statement of a Static Control Program
(SCoP), such that total communication is minimized. In this paper we show that
this algorithm can be sensitive to the layout of the global constraint matrix
thus resulting to varying performances of our target code simply by changing
the order of the constraint coefficients. We propose a simple technique that
automatically determines the right layout for the constraints and as a result
immunizes the PLuTo scheduling algorithm from constraint ordering variations.
},
  url = {http://perso.ens-lyon.fr/christophe.alias/impact2011/impact-02.pdf},
}

@InProceedings{Verdoolaege2011impact,
  author = 	 {Verdoolaege, Sven},
  title = 	 {Counting Affine Calculator and Applications},
  booktitle =    {1st International Workshop on Polyhedral Compilation Techniques (IMPACT)},
  year = 	 {2011},
  editor = 	 {C. Alias and C. Bastoul},
  address = 	 {Chamonix, France},
  abstract = {
We present an interactive tool, called iscc, for manipulating sets and
relations of integer tuples bounded by aliasne constraints over the set
variables, parameters and existentially quantified variables. A distinguishing
feature of iscc is that it provides a cardinality operation on sets and
relations that computes a symbolic expression in terms of the parameters and
domain variables for the number of elements in the set or the image of the
relation. In particular, these expressions are piecewise quasipolynomials,
which can be further manipulated in iscc. Besides basic operations on sets and
piecewise quasipolynomials, iscc also provides an interface to code generation,
lexicographic optimization, dependence analysis, transitive closures and the
symbolic computation of upper bounds and sums of piecewise quasipolynomials
over their domains.
},
  url = {http://perso.ens-lyon.fr/christophe.alias/impact2011/impact-05.pdf}
}

@InProceedings{Amini2011impact,
  author = 	 {Amini, Mehdi and Ancourt, Corinne and Coelho, Fabien and Creusillet, B{\'e}atrice and Guelton, Serge and Irigoin, François and Jouvelot, Pierre and Keryell, Ronan and Villalon, Pierre},
  title = 	 {{PIPS} Is not (just) Polyhedral Software},
  booktitle =    {1st International Workshop on Polyhedral Compilation Techniques (IMPACT)},
  year = 	 {2011},
  editor = 	 {C. Alias and C. Bastoul},
  address = 	 {Chamonix, France},
  abstract = {
Parallel and heterogeneous computing are growing in audience thanks to the
increased performance brought by ubiquitous manycores and GPU s. However,
available programming models, like OPENCL or CUDA, are far from being
straightforward to use. As a consequence, several automated or semi-automated
approaches have been proposed to automatically generate hardware-level codes
from high-level sequential sources.

Polyhedral models are becoming more popular because of their combination of
expressiveness, compactness, and accurate abstraction of the data-parallel
behaviour of programs. These models provide automatic or semi-automatic
parallelization and code transformation capabilities that target such modern
parallel architectures.

PIPS is a quarter-century old source-to-source transformation framework that
initially targeted parallel machines but then evolved to include other targets.
PIPS uses abstract interpretation on an integer polyhedral lattice to represent
program code, allowing linear relation analysis on integer variables in an
interprocedural way. The same representation is used for the dependence test
and the convex array region analysis. The polyhedral model is also more
classically used to schedule code from linear constraints.

In this paper, we illustrate the features of this compiler infrastructure on an
hypothetical input code, demonstrating the combination of polyhedral and non
polyhedral transformations.  PIPS interprocedural polyhedral analyses are used
to generate data transfers and are combined with non-polyhedral transformations
to achieve efficient CUDA code generation.
},
  url = {http://perso.ens-lyon.fr/christophe.alias/impact2011/impact-09.pdf},
}

@InProceedings{Grosser2011impact,
  author = 	 {Grosser, Tobias and Zheng, Hongbin and Aloor, Ragesh and Simb{\"u}rger, Andreas  and Gr{\"o}{\ss}linger, Armin and Pouchet, Louis-No{\"e}l},
  title = 	 {Polly -- Polyhedral Optimization in {LLVM}},
  booktitle =    {1st International Workshop on Polyhedral Compilation Techniques (IMPACT)},
  year = 	 {2011},
  editor = 	 {C. Alias and C. Bastoul},
  address = 	 {Chamonix, France},
  abstract = {
Various powerful polyhedral techniques exist to optimize computation intensive
programs ectively. Applying these techniques on any non-trivial program is
still surprisingly dincult and often not as ective as expected. Most polyhedral
tools are limited to a specific programming language.  Even for this language,
relevant code needs to match specific syntax that rarely appears in existing
code. It is therefore hard or even impossible to process existing programs
automatically. In addition, most tools target C or OpenCL code, which prevents
ective communication with compiler internal optimizers. As a result target
architecture specific optimizations are either little ective or not approached
at all.

In this paper we present Polly, a project to enable polyhedral optimizations in
LLVM. Polly automatically detects and transforms relevant program parts in a
language-independent and syntactically transparent way. Therefore, it supports
programs written in most common programming languages and constructs like C++
iterators, goto based loops and pointer arithmetic. Internally it provides a
state-of-the-art polyhedral library with full support for Z -polyhedra,
advanced data dependency analysis and support for external optimizers. Polly
includes integrated SIMD and OpenMP code generation. Through LLVM, machine code
for CPUs and GPU accelerators, C source code and even hardware descriptions can
be targeted.
},
  url = {http://perso.ens-lyon.fr/christophe.alias/impact2011/impact-07.pdf},
}


@InCollection{Wonnacott2011,
  author =	 {David Wonnacott},
  title =	 {{O}mega Calculator/Library/Project/Test},
  booktitle =	 {Encyclopedia of Parallel Computing},
  pages =	 {1355-1365},
  publisher =	 {Springer},
  year =	 2011,
  editor =	 {David Padua}
}

@InCollection{Feautrier2011,
  author =	 {Paul Feautrier and Christian Lengauer},
  title =	 {The Polyhedron Model},
  booktitle =	 {Encyclopedia of Parallel Computing},
  pages =	 {1581-1592},
  publisher =	 {Springer},
  year =	 2011,
  editor =	 {David Padua}
}

@inproceedings{Baskaran2010ctocuda,
 title={Automatic C-to-CUDA code generation for affine programs},
 author={Baskaran, Muthu Manikandan and Ramanujam, Jj and Sadayappan, P},
 booktitle={Compiler Construction},
 pages={244--263},
 year={2010},
 organization={Springer},
 url = {http://www.researchgate.net/publication/221302775_Automatic_C-to-CUDA_Code_Generation_for_Affine_Programs/file/79e4150e7f97085734.pdf},
 abstract = {
Graphics Processing Units (GPUs) offer tremendous computational power. CUDA
(Compute Unified Device Architecture) provides a multi-threaded parallel
programming model, facilitating high performance implementations of
general-purpose computations. However, the explicitly managed memory hierarchy
and multi-level parallel view make manual development of high-performance CUDA
code rather complicated. Hence the automatic transformation of sequential input
programs into efficient parallel CUDA programs is of considerable interest.

This paper describes an automatic code transformation system that generates
parallel CUDA code from input sequential C code, for regular (affine) programs.
Using and adapting publicly available tools that have made polyhedral compiler
optimization practically effective, we develop a C-to-CUDA transformation
system that generates two-level parallel CUDA code that is optimized for
efficient data access. The performance of automatically generated code is
compared with manually optimized CUDA code for a number of benchmarks. The
performance of the automatically generated CUDA code is quite close to
hand-optimized CUDA code and considerably better than the benchmarks’
performance on a multicore CPU.
}
}


@article{Belaoucha2010fadalib,
 title={FADAlib: an open source C++ library for fuzzy array dataflow analysis},
 author={Belaoucha, Marouane and Barthou, Denis and Eliche, Adrien and Touati, Sid-Ahmed-Ali},
 journal={Procedia Computer Science},
 volume={1},
 number={1},
 pages={2075--2084},
 year={2010},
 publisher={Elsevier},
 url={http://www-sop.inria.fr/members/Sid.Touati/publis/Touati-PAPP2010.pdf},
 abstract={
Ubiquitous multicore architectures require that many levels of parallelism have
to be found in codes. Dependence analysis is the main approach in compilers for
the detection of parallelism. It enables vectorisation and automatic
parallelisation, among many other optimising transformations, and is therefore
of crucial importance for optimising compilers. This paper presents new open
source software, \texttt{FADAlib}, performing an instance-wise dataflow
analysis for scalar and array references. The software is a C++ implementation
of the Fuzzy Array Dataflow Analysis (FADA) method. This method can be applied
on codes with irregular control such as \texttt{while}-loops,
\texttt{if-then-else} or non-regular array accesses, and computes exact
instance-wise dataflow analysis on regular codes. As far as we know,
\texttt{FADAlib} is the first released open source C++ implementation of
instance-wise data flow dependence handling larger classes of programs. In
addition, the library is technically independent from an existing compiler; It
can be plugged in many of them; this article shows an example of a successful
integration inside gcc/GRAPHITE. We give details concerning the library
implementation and then report some initial results with gcc and possible use
for trace scheduling on irregular codes.
}
}

@article{Verdoolaege2010Experience,
 title={Experience with widening based equivalence checking in realistic multimedia systems},
 author={Verdoolaege, Sven and Palkovi{\v{c}}, Martin and Bruynooghe, Maurice and Janssens, Gerda and Catthoor, Francky},
 journal={Journal of Electronic Testing},
 volume={26},
 number={2},
 pages={279--292},
 year={2010},
 publisher={Springer},
 url={http://www.cs.kuleuven.be/publicaties/rapporten/cw/CW572.pdf},
 abstract={
The application of loop and data transformations to array and loop intensive
programs is crucial to obtain a good performance. Designers often apply these
transformations manually or semi-automatically. For the class of static affine
programs, automatic methods exist for proving the correctness of these
transformations. Realistic multimedia systems, however, often contain
constructs that fall outside of this class. We present an extension of a
widening based approach to handle the most relevant of these constructs, viz.
accesses to array slices, data dependent accesses and data dependent
assignments, and report on some experiments with non-trivial applications.
}
}

@inproceedings{Nadezhkin2010translating,
 title={Translating affine nested-loop programs with dynamic loop bounds into polyhedral process networks},
 author={Nadezhkin, Dmitry and Nikolov, Hristo and Stefanov, Todor},
 booktitle={Embedded Systems for Real-Time Multimedia (ESTIMedia), 2010 8th IEEE Workshop on},
 pages={21--30},
 year={2010},
 organization={IEEE},
 url={http://www.researchgate.net/publication/221536495_Translating_affine_nested-loop_programs_with_dynamic_loop_bounds_into_Polyhedral_Process_Networks/file/50463519f145f25b63.pdf},
 abstract={
The Process Network (PN) is a suitable parallel model of computation (MoC) used
to specify embedded streaming applications in a parallel form facilitating the
efficient mapping onto embedded parallel execution platforms. Unfortunately,
specifying an application using a parallel MoC is very difficult and highly
error-prone task. To overcome the associated difficulties, an automated
procedure exists for derivation of a specific polyhedral process networks (PPN)
from static affine nested loop programs (SANLPs). This procedure is implemented
in the pn complier. However, there are many applications, e.g., multimedia
applications (MPEG coders/decoders, smart cameras, etc.) that have adaptive and
dynamic behavior which can not be expressed as SANLPs. Therefore, in order to
handle more dynamic multimedia applications, in this paper we address the
important question whether we can relax some of the restrictions of the SANLPs
while keeping the ability to perform compile-time analysis and to derive PPNs.
Achieving this would significantly extend the range of applications that can be
parallelized in an automated way. The main contribution of this paper is a
first approach for automated translation of affine nested loops programs with
dynamic loop bounds into input-output equivalent polyhedral process networks.
}
}

@inproceedings{Bondhugula2010xlc,
 author    = {Bondhugula, Uday and
              G{\"u}nl{\"u}k, Oktay and
              Dash, Sanjeeb and
              Renganarayanan, Lakshminarayanan},
 title     = {A model for fusion and code motion in an automatic parallelizing
              compiler},
 booktitle = {International Conference on Parallel Architectures and Compilation Techniques (PACT)},
 year      = {2010},
 pages     = {343-352},
 url = "http://drona.csa.iisc.ernet.in/~uday/publications/uday-pact10.pdf",
 abstract = "
Loop fusion has been studied extensively, but in a manner isolated from other
transformations. This was mainly due to the lack of a powerful intermediate
representation for application of compositions of high-level transformations.
Fusion presents strong interactions with parallelism and locality. Currently,
there exist no models to determine good fusion structures integrated with all
components of an auto-parallelizing compiler. This is also one of the reasons
why all the benefits of optimization and automatic parallelization of long
sequences of loop nests spanning hundreds of lines of code have never been
explored.

We present a fusion model in an integrated automatic parallelization framework
that simultaneously optimizes for hardware prefetch stream buffer utilization,
locality, and parallelism. Characterizing the legal space of fusion structures
in the polyhedral compiler framework is not difficult. However, incorporating
useful optimization criteria into such a legal space to pick good fusion
structures is very hard. The model we propose captures utilization of hardware
prefetch streams, loss of parallelism, as well as constraints imposed by
privatization and code expansion into a single convex optimization space. The
model scales very well to program sections spanning hundreds of lines of code.
It has been implemented into the polyhedral pass of the IBM XL optimizing
compiler. Experimental results demonstrate its effectiveness in finding good
fusion structures for codes including SPEC benchmarks and large applications.
An improvement ranging from 5% to nearly a factor of 2.75x is obtained over the
current production compiler optimizer on these benchmarks.
"
}

@InProceedings{Trifinovic2010,
 author = {Trifunovi\'c, Konrad and Cohen, Albert and Edelsohn, David
           and Li, Feng and Grosser, Tobias and Jagasia, Harsha
           and Ladelsky, Razya and Pop, Sebastian and Sj\"odin, Jan
           and Upadrasta, Ramakrishna },
 title = {{GRAPHITE} Two Years After: First Lessons Learned From
           Real-World Polyhedral Compilation},
 booktitle = {2nd GCC Research Opportunities Workshop (GROW)},
 year = 2010,
 url = "http://citeseerx.ist.psu.edu/viewdoc/download?rep=rep1&type=pdf&doi=10.1.1.220.3386",
 abstract = "
Modern compilers are responsible for adapting the semantics of source programs
into a form that makes efficient use of a highly complex, heterogeneous
machine. This adaptation amounts to solve an optimization problem in a huge and
unstructured search space, while predicting the performance outcome of complex
sequences of program transformations. The polyhedral model of compilation is
aimed at these challenges. Its geometrical, non-inductive semantics enables the
construction of better-structured optimization problems and precise analytical
models. Recent work demonstrated the scalability of the main polyhedral
algorithms to real-world programs. Its integration into production compilers is
under way, pioneered by the Graphite branch of the GNU Compiler Collection
(GCC). Two years after the effective beginning of the project, this paper
reports on original questions and innovative solutions that arose during the
design and implementation of Graphite.
"
}


@inproceedings{Verdoolaege2010isl,
  author =	 {Verdoolaege, Sven},
  title =	 {isl: An Integer Set Library for the Polyhedral Model},
  booktitle =	 {Mathematical Software (ICMS'10)},
  year =	 2010,
  pages =	 {299--302},
  series =	 {LNCS 6327},
  editor =	 {Fukuda, Komei and Hoeven, Joris and Joswig, Michael and
                  Takayama, Nobuki},
  publisher =	 {Springer-Verlag},
}

@inproceedings{Benabderrahmane2010polyhedral,
  title={The polyhedral model is more widely applicable than you think},
  author={Benabderrahmane, Mohamed-Walid and Pouchet, Louis-No{\"e}l and Cohen, Albert and Bastoul, C{\'e}dric},
  booktitle={Compiler Construction},
  pages={283--303},
  year={2010},
  organization={Springer},
  abstract={
The polyhedral model is a powerful framework for automatic optimization and
parallelization. It is based on an algebraic representation of programs,
allowing to construct and search for complex sequences of optimizations. This
model is now mature and reaches production compilers. The main limitation of
the polyhedral model is known to be its restriction to statically predictable,
loop-based program parts. This paper removes this limitation, allowing to
operate on general data-dependent control-flow. We embed control and exit
predicates as first-class citizens of the algebraic representation, from
program analysis to code generation. Complementing previous (partial) attempts
in this direction, our work concentrates on extending the code generation step
and does not compromise the expressiveness of the model. We present
experimental evidence that our extension is relevant for program optimization
and parallelization, showing performance improvements on benchmarks that were
thought to be out of reach of the polyhedral model.
},
  pdf={http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.162.407&rep=rep1&type=pdf}
}

@inproceedings{Reservoir2010,
 author = {Leung, Allen and Vasilache, Nicolas and Meister, Beno\^{\i}t and
Baskaran, Muthu and Wohlford, David and Bastoul, C{\'e}dric and Lethin,
Richard},
 title = {A mapping path for multi-{GPGPU} accelerated computers from a
portable high level programming abstraction},
 booktitle = {3rd Workshop on General-Purpose Computation on Graphics
Processing Units (GPGPU)},
 series = {GPGPU '10},
 year = {2010},
 publisher = {ACM},
 address = {New York, NY, USA},
 url = "http://www.researchgate.net/publication/220938996_A_mapping_path_for_multi-GPGPU_accelerated_computers_from_a_portable_high_level_programming_abstraction/file/79e41502512b8a3f2b.pdf",
 abstract = "
Programmers for GPGPU face rapidly changing substrate of programming
abstractions, execution models, and hardware implementations. It has been
established, through numerous demonstrations for particular conjunctions of
application kernel, programming languages, and GPU hardware instance, that it
is possible to achieve significant improvements in the price/performance and
energy/performance over general purpose processors. But these demonstrations
are each the result of significant dedicated programmer labor, which is likely
to be duplicated for each new GPU hardware architecture to achieve performance
portability.

This paper discusses the implementation, in the R-Stream compiler, of a source
to source mapping pathway from a high-level, textbook-style algorithm
expression method in ANSI C, to multi-GPGPU accelerated computers. The compiler
performs hierarchical decomposition and parallelization of the algorithm
between and across host, multiple GPGPUs, and within-GPU. The semantic
transformations are expressed within the polyhedral model, including
optimization of integrated parallelization, locality, and contiguity tradeoffs.
Hierarchical tiling is performed. Communication and synchronizations operations
at multiple levels are generated automatically. The resulting mapping is
currently emitted in the CUDA programming language.

The GPU backend adds to the range of hardware and accelerator targets for
R-Stream and indicates the potential for performance portability of single
sources across multiple hardware targets.
"
}

@inproceedings{Yuki2010TileSizeModels,
 author = {Yuki, Tomofumi and Renganarayanan, Lakshminarayanan and Rajopadhye, Sanjay and Anderson, Charles and Eichenberger, Alexandre E. and O'Brien, Kevin},
 title = {Automatic Creation of Tile Size Selection Models},
 booktitle = {Proceedings of the 8th Annual IEEE/ACM International Symposium on Code Generation and Optimization},
 series = {CGO '10},
 year = {2010},
 isbn = {978-1-60558-635-9},
 location = {Toronto, Ontario, Canada},
 pages = {190--199},
 numpages = {10},
 url = {http://www.cs.colostate.edu/~yuki/papers/yuki-cgo2010.pdf},
 doi = {10.1145/1772954.1772982},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {machine learning, neural network, performance modeling, tiling},
 abstract = {
Tiling is a widely used loop transformation for exposing/exploiting parallelism
and data locality. Effective use of tiling requires selection and tuning of the
tile sizes. This is usually achieved by hand-crafting tile size selection (TSS)
models that characterize the performance of the tiled program as a function of
tile sizes. The best tile sizes are selected by either directly using the TSS
model or by using the TSS model together with an empirical search.
Hand-crafting accurate TSS models is hard, and adapting them to different
architecture/compiler, or even keeping them up-to-date with respect to the
evolution of a single compiler is often just as hard. Instead of hand-crafting
TSS models, can we automatically learn or create them? In this paper, we show
that for a specific class of programs fairly accurate TSS models can be
automatically created by using a combination of simple program features,
synthetic kernels, and standard machine learning techniques. The automatic TSS
model generation scheme can also be directly used for adapting the model and/or
keeping it up-to-date. We evaluate our scheme on six different
architecture-compiler combinations (chosen from three different architectures
and four different compilers). The models learned by our method have
consistently shown near-optimal performance (within 5% of the optimal on
average) across all architecture-compiler combinations.
}
} 


@inproceedings{Grosslinger2009,
 author = {Gr\"{o}{\ss}linger, Armin},
 title = {Precise Management of Scratchpad Memories for Localising Array Accesses in Scientific Codes},
 booktitle = {Proceedings of the 18th International Conference on Compiler Construction: Held As Part of the Joint European Conferences on Theory and Practice of Software, ETAPS 2009},
 series = {CC '09},
 year = {2009},
 pages = {236--250},
 numpages = {15},
 url = {http://dx.doi.org/10.1007/978-3-642-00722-4_17},
 doi = {10.1007/978-3-642-00722-4_17},
 publisher = {Springer-Verlag},
 address = {Berlin, Heidelberg},
 abstract = {
Unlike desktop and server CPUs, special-purpose processors found in embedded
systems and on graphics cards often do not have a cache memory which is
managed automatically by hardware logic. Instead, they offer a so-called
scratchpad memory which is fast like a cache but, unlike a cache, has to be
managed explicitly, i.e., the burden of its efficient use is imposed on the
software. We present a method for computing precisely which memory cells are
reused due to temporal locality of a certain class of codes, namely codes
which can be modelled in the well-known polyhedron model. We present some
examples demonstrating the effectiveness of our method for scientific codes.
}
}

@inproceedings{Sjodin2009design,
 title = {Design of graphite and the polyhedral compilation package},
 author = {Sj{\"o}din, Jan and Pop, Sebastian and Jagasia, Harsha and Grosser, Tobias and Pop, Antoniu and others},
 booktitle = {GCC Developers' Summit},
 year = {2009},
 abstract = "
Graphite is the loop transformation framework that was introduced in GCC 4.4.
This paper gives a detailed description of the design and future directions of
this infrastructure. Graphite uses the polyhedral model as the internal
representation (GPOLY). The plan is to create a polyhedral compilation package
(PCP) that will provide loop optimization and analysis capabilities to GCC.
This package will be separated from GIMPLE via an interface language that is
restricted to express only what GPOLY can represent. The interface language is
a set of data structures that encodes the control flow and memory accesses of a
code region. A syntax for the language is also defined to facilitate debugging
and testing.",
 url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.158.2112&rep=rep1&type=pdf},
}

@PhdThesis{Pouchet2010,
  author = 	 {Louis-No{\"e}l Pouchet},
  title = 	 {Interative Optimization in the Polyhedral Model},
  school = 	 {University of Paris-Sud 11},
  year = 	 {2010},
  address = 	 {Orsay, France},
  month = 	 Jan
}

@article{Baskaran2009,
 author = {Baskaran, Muthu Manikandan and Vydyanathan, Nagavijayalakshmi and
           Bondhugula, Uday Kumar Reddy and Ramanujam, J. and Rountev, Atanas
           and Sadayappan, P.},
 title = {Compiler-assisted dynamic scheduling for effective parallelization of
          loop nests on multicore processors},
 journal = {SIGPLAN Notices},
 issue_date = {April 2009},
 volume = {44},
 number = {4},
 month = feb,
 year = {2009},
 issn = {0362-1340},
 pages = {219--228},
 numpages = {10},
 url = {http://doi.acm.org/10.1145/1594835.1504209},
 doi = {10.1145/1594835.1504209},
 acmid = {1504209},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {compile-time optimization, dynamic scheduling, run-time
             optimization},
 abstract = {
  Recent advances in polyhedral compilation technology have made it feasible to
  automatically transform affine sequential loop nests for tiled parallel
  execution on multi-core processors. However, for multi-statement input
  programs with statements of different dimensionalities, such as Cholesky or
  LU decomposition, the parallel tiled code generated by existing automatic
  parallelization approaches may suffer from significant load imbalance,
  resulting in poor scalability on multi-core systems. In this paper, we develop
  a completely automatic parallelization approach for transforming input affine
  sequential codes into efficient parallel codes that can be executed on a
  multi-core system in a load-balanced manner. In our approach, we employ a
  compile-time technique that enables dynamic extraction of inter-tile
  dependences at run-time, and dynamic scheduling of the parallel tiles on the
  processor cores for improved scalable execution. Our approach obviates the
  need for programmer intervention and re-writing of existing algorithms for
  efficient parallel execution on multi-cores. We demonstrate the usefulness of
  our approach through comparisons using linear algebra computations: LU and
  Cholesky decomposition.
 }
}

@inproceedings{Verdoolaege2009equivalence,
 author = "Verdoolaege, Sven and Janssens, Gerda and Bruynooghe, Maurice",
 title = "Equivalence checking of static affine programs using widening to handle recurrences",
 booktitle = "Computer Aided Verification 21",
 month = Jun,
 year = 2009,
 pages = "599--613",
 publisher = "Springer",
 url = "http://www.cs.kuleuven.ac.be/publicaties/rapporten/cw/CW565.pdf",
 abstract = "
Designers often apply manual or semi-automatic loop and data transformations on
array and loop intensive programs to improve performance. The transformations
should preserve the functionality, however, and this paper presents an
automatic method for constructing equivalence proofs for the class of static
affine programs. The equivalence checking is performed on a dependence graph
abstraction and uses a new approach based on widening to handle recurrences.
Unlike transitive closure based approaches, this widening approach can also
handle non-uniform recurrences. The implementation is publicly available and is
the first of its kind to fully support commutative operations.
"
}

@article{Bagnara2008parma,
 title={The Parma Polyhedra Library: Toward a complete set of numerical abstractions for the analysis and verification of hardware and software systems},
 author={Bagnara, Roberto and Hill, Patricia M and Zaffanella, Enea},
 journal={Science of Computer Programming},
 volume={72},
 number={1},
 pages={3--21},
 year={2008},
 publisher={Elsevier},
 url={http://bugseng.com/products/ppl/documentation/BagnaraHZ08SCP.pdf},
 abstract={
Since its inception as a student project in 2001, initially just for the
handling (as the name implies) of convex polyhedra, the Parma Polyhedra Library
has been continuously improved and extended by joining scrupulous research on
the theoretical foundations of (possibly non-convex) numerical abstractions to
a total adherence to the best available practices in software development. Even
though it is still not fully mature and functionally complete, the Parma
Polyhedra Library already offers a combination of functionality, reliability,
usability and performance that is not matched by similar, freely available
libraries. In this paper, we present the main features of the current version
of the library, emphasizing those that distinguish it from other similar
libraries and those that are important for applications in the field of
analysis and verification of hardware and software systems.
}
}

@misc{Bastoul2008clan,
  title={Clan - A polyhedral representation extractor for high level programs},
  author={Bastoul, Cedric},
  year={2008},
}

@article{Bondhugula2008Pluto,
 author = {Bondhugula, Uday and Hartono, Albert and Ramanujam, J. and Sadayappan, P.},
 title = {A practical automatic polyhedral parallelizer and locality optimizer},
 journal = {SIGPLAN Notices},
 volume = {43},
 number = {6},
 year = {2008},
 issn = {0362-1340},
 pages = {101--113},
 doi = {http://doi.acm.org/10.1145/1379022.1375595},
 publisher = {ACM},
 address = {New York, NY, USA},
}

@inproceedings{Bondhugula08cc,
    author = {Uday Bondhugula and Muthu Baskaran and Sriram
        Krishnamoorthy and J. Ramanujam and A. Rountev and P.
            Sadayappan},
    title = {Automatic Transformations for Communication-Minimized
        Parallelization and Locality Optimization in the Polyhedral Model},
    booktitle = {International Conference on Compiler Construction
        (ETAPS CC)},
    year = 2008,
    month = apr,
    url = {http://drona.csa.iisc.ernet.in/~uday/publications/uday-cc08.pdf},
    abstract = {

    The polyhedral model provides powerful abstractions to optimize loop nests with 
    regular accesses.  Affine transformations in this model capture a complex
    sequence of execution-reordering loop transformations that can
    improve performance by parallelization as well as locality
    enhancement.  Although a significant body of research has addressed
    affine scheduling and partitioning, the problem of automatically
    finding good affine transforms for communication-optimized
    coarse-grained parallelization together with locality optimization
    for the general case of arbitrarily-nested loop sequences remains a
    challenging problem.

    We propose an automatic transformation framework to
    optimize arbitrarily-nested loop sequences with affine dependences
    for parallelism and locality simultaneously. The approach finds
    good tiling hyperplanes by embedding a powerful and versatile cost
    function into an Integer Linear Programming formulation. These
    tiling hyperplanes are used for communication-minimized
    coarse-grained parallelization as well as for locality
    optimization. The approach enables the minimization of inter-tile
    communication volume in the processor space, and minimization of
    reuse distances for local execution at each node. Programs
    requiring one-dimensional versus multi-dimensional time schedules
    (with scheduling-based approaches) are all handled with the same
    algorithm.  Synchronization-free parallelism, permutable loops or
    pipelined parallelism at various levels can be detected.
    Preliminary studies of the framework show promising results.
    }
}

@TECHREPORT{Chen2008chill,
 author = {Chen, Chun and Chame, Jacqueline and Hall, Mary},
 title = {A framework for composing high-level loop transformations},
 institution = {USC},
 year = {2008},
 url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.214.8396&rep=rep1&type=pdf",
 abstract = "
This paper describes a general and robust loop transformation framework that
enables compilers to generate efficient code on complex loop nests. Despite two
decades of prior research on loop optimization, performance of
compiler-generated code often falls short of manually optimized versions, even
for some well-studied BLAS kernels. There are two primary reasons for this.
First, today’s compilers employ fixed transformation strategies, making it
difficult to adapt to different optimization requirements for different
application codes. Second, code transformations are treated in isolation, not
taking into account the interactions between different transformations. This
paper addresses such limitations in a unified framework that supports a broad
collection of transformations, (permutation, tiling, unroll-and-jam, data
copying, iteration space splitting, fusion, distribution and others), which go
beyond existing polyhedral transformation models. This framework is a key
element of a compiler we are developing which performs empirical optimization
to evaluate a collection of alternative optimized variants of a code segment. A
script interface to code generation and empirical search permits transformation
parameters to be adjusted independently and tested; alternative scripts are
used to represent different code variants. By applying this framework to
example codes, we show performance results on automaticallygenerated code for
the Pentium M and MIPS R10000 that are comparable to the best hand-tuned codes,
and significantly better (up to a 14x speedup) than the native compilers.
"
}

@InProceedings{Pouchet2008Iterative,
  author = 	 {Louis-No{\"e}l Pouchet and C{\'e}dric Bastoul and
                  Albert Cohen and John Cavazos },
  title = 	 { Iterative optimization in the polyhedral model: Part {II},
  multidimensional time },
  booktitle = {ACM SIGPLAN Conference on Programming Language Design and Implementation (PLDI'08)},
  year = 	 { 2008 },
  pages = 	 {90--100},
  address = 	 { Tucson, Arizona },
  month = 	 { June},
  publisher =	 { ACM Press },
  url = {http://www.cse.ohio-state.edu/~pouchet/doc/pldi-article.08.pdf}
}

@inproceedings{Renganarayana2008positivity,
  title={Positivity, posynomials and tile size selection},
  author={Renganarayana, Lakshminarayanan and Rajopadhye, Sanjay},
  booktitle={High Performance Computing, Networking, Storage and Analysis, 2008. SC 2008. International Conference for},
  pages={1--12},
  year={2008},
  organization={IEEE},
  abstract={
Tiling is a widely used loop transformation for exposing/exploiting parallelism
and data locality. Effective use of tiling requires selection and tuning of the
tile sizes. This is usually achieved by developing cost models that
characterize the performance of the tiled program as a function of tile sizes.
All previous approaches to tile size selection (TSS) are cost model specific.
Due to this they are neither extensible (e.g., to richer program classes/newer
architectures) nor scalable (e.g., to multiple levels of tiling). This paper
identifies positivity as a fundamental property shared by the functions and
parameters commonly used in TSS models. We show how this positivity can be used
as a basis to derive a TSS framework which is both efficient and scalable. We
also show that almost all TSS models proposed in the literature (including
those used in production compilers and auto-tuners) can be reduced to our
framework.
}
}


@inproceedings{Alias2007bee,
 title={Bee+ cl@ k: An implementation of lattice-based array contraction in the source-to-source translator rose},
 author={Alias, Christophe and Baray, Fabrice and Darte, Alain},
 booktitle={ACM SIGPLAN Notices},
 volume={42},
 number={7},
 pages={73--82},
 year={2007},
 organization={ACM},
 url = {http://perso.ens-lyon.fr/christophe.alias/pub/lctes2007.pdf},
 abstract = {
 We build on prior work on intra-array memory reuse, for which a general
theoretical framework was proposed based on lattice theory. Intra-array memory
reuse is a way of reducing the size of a temporary array by folding, thanks to
affine mappings and modulo operations, reusing memory locations when they
contain a value not used later. We describe the algorithms needed to implement
such a strategy. Our implementation has two parts. The first part, Bee, uses
the source-to-source transformer ROSE to extract from the program all necessary
information on the lifetime of array elements and to generate the code after
memory reduction. The second part, Cl@k, is a stand-alone mathematical tool
dedicated to optimizations on polyhedra, in particular the computation of
successive minima and the computation of good admissible lattices, which are
the basis for lattice-based memory reuse. Both tools are developed in C++ and
use linear programming and polyhedra manipulations. They can be used either for
embedded program optimizations, e.g., to limit memory expansion introduced for
parallelization, or in high-level synthesis, e.g., to design memories between
communicating hardware accelerators.
}
}
@inproceedings{Kim2007Multi,
 author = {Kim, DaeGon and Renganarayanan, Lakshminarayanan and Rostron, Dave
and Rajopadhye, Sanjay and Strout, Michelle Mills},
 title = {Multi-level Tiling: M for the Price of One},
 booktitle = {Proceedings of the 2007 ACM/IEEE Conference on Supercomputing},
 series = {SC '07},
 year = {2007},
 isbn = {978-1-59593-764-3},
 location = {Reno, Nevada},
 pages = {51:1--51:12},
 articleno = {51},
 numpages = {12},
 doi = {10.1145/1362622.1362691},
 url = {http://www.cs.colostate.edu/~mstrout/Papers/Papers05-09/SC07-multilevel.pdf},
 abstract = {
Tiling is a widely used loop transformation for exposing/exploiting parallelism
and data locality. High-performance implementations use multiple levels of
tiling to exploit the hierarchy of parallelism and cache/register locality.
Efficient generation of multi-level tiled code is essential for effective use
of multi-level tiling. Parameterized tiled code, where tile sizes are not fixed
but left as symbolic parameters can enable several dynamic and run-time
optimizations. Previous solutions to multi-level tiled loop generation are
limited to the case where tile sizes are fixed at compile time. We present an
algorithm that can generate multi-level parameterized tiled loops at the same
cost as generating single-level tiled loops. The efficiency of our method is
demonstrated on several benchmarks. We also present a method-useful in register
tiling-for separating partial and full tiles at any arbitrary level of tiling.
The code generator we have implemented is available as an open source tool.}
}

@InProceedings{Pouchet2007Iterative,
  author = 	 { Louis-No{\"e}l Pouchet and C{\'e}dric Bastoul and Albert
  Cohen and Nicolas Vasilache },
  title =        { Iterative optimization in the polyhedral model: Part {I},
  one-dimensional time },
  booktitle =    {IEEE/ACM Fifth International Symposium on Code Generation and
  Optimization (CGO'07)},
  year = 	 { 2007 },
  publisher =    { IEEE Computer Society press },
  pages =        { 144--156 },
  address = 	 { San Jose, California },
  month = 	 { March },
  url = {http://www.cse.ohio-state.edu/~pouchet/doc/cgo-article.07.pdf}
}

@article{Verdoolaege2007pn,
 title={PN: a tool for improved derivation of process networks},
 author={Verdoolaege, Sven and Nikolov, Hristo and Stefanov, Todor},
 journal={EURASIP journal on Embedded Systems},
 volume={2007},
 number={1},
 pages={19--19},
 year={2007},
 publisher={Hindawi Publishing Corp.},
 url= {http://artemisia.liacs.nl/publications/Verdoolaege2007PN.pdf},
 abstract = "
Current emerging embedded System-on-Chip platforms are increasingly becoming
multiprocessor architectures. System designers experience significant
difficulties in programming these platforms. The applications are typically
specified as sequential programs that do not reveal the available parallelism
in an application, thereby hindering the effcient mapping of an application
onto a parallel multiprocessor platform. In this paper, we present our compiler
techniques for facilitating the migration from a sequential application
specification to a parallel application specification using the process network
model of computation. Our work is inspired by a previous research project
called Compaan. With our techniques we address optimization issues such as the
generation of process networks with simplified topology and communication
without sacrificing the process networks' performance. Moreover, we describe a
technique for compile-time memory requirement estimation which we consider as
an important contribution of this paper. We demonstrate the usefulness of our
techniques on several examples.
"
}

@article{Verdoolaege2007parametric,
 author = "Verdoolaege, Sven and Seghir, Rachid and Beyls, Kristof and Loechner, Vincent and Bruynooghe, Maurice",
 title = "Counting integer points in parametric polytopes using {Barvinok}'s rational functions",
 journal = "Algorithmica",
 year = 2007,
 volume = 48,
 number = 1,
 month = jun,
 publisher = "Springer New York",
 pages = "37--66",
 url = {http://dtai.cs.kuleuven.be/publications/files/41970.pdf},
 abstract = "
Many compiler optimization techniques depend on the ability to calculate the
number of elements that satisfy certain conditions. If these conditions can be
represented by linear constraints, then such problems are equivalent to
counting the number of integer points in (possibly) parametric polytopes. It is
well known that the enumerator of such a set can be represented by an explicit
function consisting of a set of quasi-polynomials, each associated with a
chamber in the parameter space. Previously, interpolation was used to obtain
these quasi-polynomials, but this technique has several disadvantages. Its
worst-case computation time for a single quasi-polynomial is exponential in the
input size, even for fixed dimensions. The worst-case size of such a
quasi-polynomial (measured in bits needed to represent the quasi-polynomial) is
also exponential in the input size. Under certain conditions this technique
even fails to produce a solution. Our main contribution is a novel method for
calculating the required quasi-polynomials analytically. It extends an existing
method, based on Barvinok's decomposition, for counting the number of integer
points in a non-parametric polytope. Our technique always produces a solution
and computes polynomially-sized enumerators in polynomial time (for fixed
dimensions). 
"
}

@inproceedings{Krishnamoorthy2007,
 author = {Krishnamoorthy, Sriram and Baskaran, Muthu and Bondhugula, Uday and Ramanujam, J. and Rountev, Atanas and Sadayappan, P.},
 title = {Effective automatic parallelization of stencil computations},
 booktitle = {Conference on Programming Language Design and Implementation (PLDI)},
 year = {2007},
 pages = {235--244},
 url = {http://drona.csa.iisc.ernet.in/~uday/publications/uday-pldi07.pdf},
 abstract = {
Performance optimization of stencil computations has been widely studied in the
literature, since they occur in many computationally intensive scientific and
engineering applications. Compiler frameworks have also been developed that can
transform sequential stencil codes for optimization of data locality and
parallelism. However, loop skewing is typically required in order to tile
stencil codes along the time dimension, resulting in load imbalance in
pipelined parallel execution of the tiles. In this paper, we develop an
approach for automatic parallelization of stencil codes, that explicitly
addresses the issue of load-balanced execution of tiles. Experimental results
are provided that demonstrate the effectiveness of the approach.
}
}

@phdthesis{Vasilache2007PhD,
 author = {Vasilache, Nicolas T.},
 keywords = {model, optimization, polyhedral, scalable, thesis, vasilache},
 month = {September},
 school = {Universit\'{e} Paris Sud XI, Orsay},
 title = {Scalable Program Optimization Techniques in the Polyhedral Model},
 year = {2007},
 url = {http://www.researchgate.net/publication/230759973_Scalable_Program_Optimization_Techniques_in_the_Polyhedral_Model/file/9fcfd503fb50e9ef18.pdf},
 abstract = {
Limited by ever-increasing power consumption and control complexity, current
processors have evolved to multiprocessor architectures on a chip with
increasingly many cores per chip and multiple threads per core. Compilers are
responsible for translating the idealistic operational semantics of the source
program into a form that makes efficient use of a highly complex heterogeneous
machine.

The polyhedral model is a sound mathematical abstraction to represent programs
with affine control loops. It alleviates many tradeoffs hampering current
optimizing compilers.  In the first part of this thesis, we discuss the
problems faced by compilers. We abstract from these and show how a
semi-automatic approach alleviates the user from producing complex code for
long sequences of transformations. In the second part, we discuss the
scheduling problem. We provide a new formulation to express the set of all
legal multidimensional schedules in a single optimization problem. We take a
fresh look at dependence analysis and program legality. This allows us to
devise a complex sequences automatic correction process to fix illegal
transformations. In the third part we discuss the code generation issues and
build the first fully reentrant framework based on the polyhedral model. We
introduce the concept of code generation optimizing transformations and
schedule equivalence to tailor syntactical optimizations at the finest possible
grain.  All the contributions have been implemented either in the URU K
framework on top of the Open64 compiler, or as part of IBM’s XLC compiler.
}
}

@article{Schweitz2006rstream,
 title={R-stream: A parametric high level compiler},
 author={Schweitz, Eric and Lethin, Richard and Leung, Allen and Meister, Benoit},
 journal={Proceedings of HPEC},
 year={2006},
 url={http://llwww.ll.mit.edu/HPEC/agendas/proc06/Day2/21_Schweitz_Pres.pdf},
}

@inproceedings{Vasilache2006real,
 author = {Nicolas Vasilache and
           C{\'e}dric Bastoul and
           Albert Cohen},
 title = {Polyhedral Code Generation in the Real World},
 year = {2006},
 pages = {185-201},
 booktitle = {International Conference on Compiler Construction (CC)},
 address = {Vienna},
 publisher = {Springer},
 series = {LNCS},
 volume = {3923},
 url = "http://icps.u-strasbg.fr/~bastoul/research/papers/VBC06-CC.pdf",
 abstract = "
The polyhedral model is known to be a powerful framework to reason about high
level loop transformations. Recent developments in optimizing compilers broke
some generally accepted ideas about the limitations of this model. First,
thanks to advances in dependence analysis for irregular access patterns, its
applicability which was supposed to be limited to very simple loop nests has
been extended to wide code regions. Then, new algorithms made it possible to
compute the target code for hundreds of statements while this code generation
step was expected not to be scalable. Such theoretical advances and new
software tools allowed actors from both academia and industry to study more
complex and realistic cases. Unfortunately, despite strong optimization
potential of a given transformation for e.g., parallelism or data locality,
code generation may still be challenging or result in high control overhead.
This paper presents scalable code generation methods that make possible the
application of increasingly complex program transformations. By studying the
transformations themselves, we show how it is possible to benefit from their
properties to dramatically improve both code generation quality and space/time
complexity, with respect to the best state-of-the-art code generation tool. In
addition, we build on these improvements to present a new algorithm improving
generated code performance for strided domains and reindexed schedules.
"
}

@inproceedings{Pop2006graphite,
 title={GRAPHITE: Polyhedral analyses and optimizations for GCC},
 author={Pop, Sebastian and Cohen, Albert and Bastoul, C{\'e}dric and Girbal, Sylvain and Silber, Georges-Andr{\'e} and Vasilache, Nicolas},
 booktitle={Proceedings of the 2006 GCC Developers Summit},
 pages={2006},
 year={2006},
 pdf = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.307.1260&rep=rep1&type=pdf#page=185},
 abstract = {
We present a plan to add loop nest optimizations in GCC based on polyhedral
representations of loop nests. We advocate a static analysis approach based on
a hierarchy of interchangeable abstractions with solvers that range from the
exact solvers such as OMEGA, to faster but less precise solvers based on more
coarse abstractions. The intermediate representation GRAPHITE (GIMPLE
Represented as Polyhedra with Interchangeable Envelopes), built on GIMPLE and
the natural loops, hosts the high level loop transformations. We base this
presentation on the WRaP-IT project developed in the Alchemy group at INRIA
Futurs and Paris-Sud University, on the PIPS compiler developed at École des
mines de Paris, and on a joint work with several members of the static analysis
and polyhedral compilation community in France. The main goal of this project
is to bring more high level loop optimizations to GCC: loop fusion, tiling,
strip mining, etc. Thanks to the WRaP-IT experience, we know that the
polyhedral analyzes and transformations are affordable in a production
compiler. A second goal of this project is to experiment with compile time
reduction versus attainable precision when replacing operations on polyhedra
with faster operations on more abstract domains. However, the use of a too
coarse representation for computing might also result in an over approximated
solution that cannot be used in subsequent computations. There exists a trade
off between speed of the computation and the attainable precision that has not
yet been analyzed for real world programs.
}
}


@article{Girbal2006,
 author = {Girbal, Sylvain and Vasilache, Nicolas and Bastoul, C{\'e}dric and Cohen, Albert and Parello, David and Sigler, Marc and Temam, Olivier},
 title = {Semi-automatic composition of loop transformations for deep parallelism and memory hierarchies},
 journal = {Int. J. Parallel Program.},
 issue_date = {June 2006},
 volume = {34},
 number = {3},
 month = jun,
 year = {2006},
 issn = {0885-7458},
 pages = {261--317},
 numpages = {57},
 doi = {10.1007/s10766-006-0012-3},
 acmid = {1165153},
 publisher = {Kluwer Academic Publishers},
 address = {Norwell, MA, USA},
 keywords = {automatic parallelization, compiler optimization, polyhedral model, semi-automatic program transformation},
}

@phdthesis{Stefanov2004phd,
  title={Converting weakly dynamic programs to equivalent process network specifications},
  author={Stefanov, Todor},
  year={2004},
  publisher={T. Stefanov}
}

@PhdThesis{Bastoul2004PhD,
 author = {Bastoul, C\'{e}dric},
 title = {Improving Data Locality in Static Control Programs},
 school = {University Paris 6, Pierre et Marie Curie, France},
 month = dec,
 year = 2004,
 abstract = {
  Cache memories were invented to decouple fast processors from slow memories.
  However, this decoupling is only partial and many researchers have attempted to
  improve cache use by program optimization. Potential benefits are significant
  since both energy dissipation and performance highly depend on the traffic
  between memory levels.
  This thesis will visit most of the steps of high level transformation frameworks
  in the polyhedral model in order to improve both applicability and target code
  quality. To achieve this goal, we refine the concept of static control parts
  (SCoP) and we show experimentally that this program model is relevant to
  real-life applications. We present a transformation policy freed of classical
  limitations like considering only unimodular or invertible functions. Lastly,
  we extend the Quiller\'{e} et al. algorithm to be able to generate efficient codes
  in a reasonable amount of time. To exploit this transformation framework, we
  propose a data locality improvement method based on a singular execution scheme
  where an asymptotic evaluation of the memory traffic is possible. This information
  is used in our optimization algorithm to find the best reordering of the program
  operations, at least in an asymptotic sense. This method considers legality
  and each type of data locality as constraints whose solution is an appropriate
  transformation. The optimizer has been prototyped and tested with non-trivial
  programs. Experimental evidence shows that our framework can improve data
  locality and performance in traditionally challenging programs with e.g.
  non-perfectly nested loops, complex dependences and non-uniformly generated
  references.
 }
}

@InProceedings{Bastoul2004CLooG,
 author = {Bastoul, C\'{e}dric},
 title = {Code Generation in the Polyhedral Model Is Easier Than You Think},
 booktitle = {IEEE International Conference on Parallel Architecture
              and Compilation Techniques},
 year = 2004,
 pages = {7--16},
 month = {September},
 address = {Juan-les-Pins, France},
 abstract = {
  Many advances in automatic parallelization and optimization have been
  achieved through the polyhedral model. It has been extensively shown that this
  computational model provides convenient abstractions to reason about and apply
  program transformations. Nevertheless, the complexity of code generation has
  long been a deterrent for using polyhedral representation in optimizing
  compilers. First, code generators have a hard time coping with generated code
  size and control overhead that may spoil theoretical benefits achieved by the
  transformations. Second, this step is usually time consuming, hampering the
  integration of the polyhedral framework in production compilers or
  feedback-directed, iterative optimization schemes. Moreover, current code
  generation algorithms only cover a restrictive set of possible transformation
  functions. This paper discusses a general transformation framework able to
  deal with non-unimodular, non-invertible, non-integral or even non-uniform
  functions. It presents several improvements to a state-of-the-art code
  generation algorithm. Two directions are explored: generated code size and
  code generator efficiency. Experimental evidence proves the ability of the
  improved method to handle real-life problems.
 }
}

@article{Goumas2003efficient,
  title={An efficient code generation technique for tiled iteration spaces},
  author={Goumas, Georgios and Athanasaki, Maria and Koziris, Nectarios},
  journal={Parallel and Distributed Systems, IEEE Transactions on},
  volume={14},
  number={10},
  pages={1021--1034},
  year={2003},
  publisher={IEEE},
  url={http://ftp.cslab.ece.ntua.gr/~goumas/downloads/tpds2003.pdf},
  abstract={
This paper presents a novel approach for the problem of generating tiled code
for nested for-loops, transformed by a tiling transformation. Tiling or
supernode transformation has been widely used to improve locality in
multi-level memory hierarchies, as well as to efficiently execute loops onto
parallel architectures. However, automatic code generation for tiled loops can
be a very complex compiler work, especially when non-rectangular tile shapes
and iteration space bounds are concerned. Our method considerably enhances
previous work on rewriting tiled loops, by considering parallelepiped tiles and
arbitrary iteration space shapes. In order to generate tiled code, we first
enumerate all tiles containing points within the iteration space and second
sweep all points within each tile. For the first subproblem This paper presents
a novel approach for the problem of generating tiled code for nested for-loops,
transformed by a tiling transformation. Tiling or supernode transformation has
been widely used to improve locality in multi-level memory hierarchies, as well
as to efficiently execute loops onto parallel architectures. However, automatic
code generation for tiled loops can be a very complex compiler work, especially
when non-rectangular tile shapes and iteration space bounds are concerned. Our
method considerably enhances previous work on rewriting tiled loops, by
considering parallelepiped tiles and arbitrary iteration space shapes. In order
to generate tiled code, we first enumerate all tiles containing points within
the iteration space and second sweep all points within each tile. For the first
subproblem
}
}

@inproceedings{Darte2001,
 athor= {Alain Darte and Yves Robert and Fr\'ed\'eric Vivien},
 booktitle = {Compiler Optimizations for Scalable Parallel Systems: Languages, Compilation Techniques and Run Time Systems},
 publisher = {Springer Verlag},
 series = {LNCS},
 volume = {1808},
 title = {Loop parallelization algorithms},
 pages = {141-171},
 year = {2001},
 url = "http://graal.ens-lyon.fr/~fvivien/Publications/Chapter-LNCS.pdf",
 abstract = "
This chapter is devoted to a comparative survey of loop parallelization
algorithms. Various algorithms have been presented in the literature, such as
those introduced by Allen and Kennedy, Wolf and Lam, Darte and Vivien, and
Feautrier. These algorithms make use of different mathematical tools. Also,
they do not rely on the same representation of data dependences. In this
chapter, we survey each of these algorithms, and we assess their power and
limitations, both through examples and by stating “optimality” results. An
important contribution of this chapter is to characterize which algorithm is
the most suitable for a given representation of dependences. This result is of
practical interest, as it provides guidance for a compiler-parallelizer: given
the dependence analysis that is available, the simplest and cheapest
parallelization algorithm that remains optimal should be selected.
"
}

@article{Griebl2000index,
  title={Index set splitting},
  author={Griebl, Martin and Feautrier, Paul and Lengauer, Christian},
  journal={International Journal of Parallel Programming},
  volume={28},
  number={6},
  pages={607--631},
  year={2000},
  publisher={Springer},
  url={http://www.infosun.fim.uni-passau.de/cl/publications/docs/GFL00ijpp.pdf},
  abstract="
There are many algorithms for the space-time mapping of nested loops. Some
of them even make the optimal choices within their framework. We propose a
preprocessing phase for algorithms in the polytope model, which extends the
model and yields space-time mappings whose schedule is, in some cases, orders
of magnitude faster. These are cases in which the dependence graph has small
irregularities. The basic idea is to split the index set of the loop nests into parts
with a regular dependence structure and apply the existing space-time mapping
algorithms to these parts individually. This work is based on a seminal idea in
the more limited context of loop parallelization at the code level. We elevate the
idea to the model level (our model is the polytope model), which increases its
applicability by providing a clearer and wider range of choices at an acceptable
analysis cost. Index set splitting is one facet in the effort to extend the power of
the polytope model and to enable the generation of competitive target code."
}

@article{Quillere2000,
 author = {Quiller\'{e}, Fabien and Rajopadhye, Sanjay and Wilde, Doran},
 title = {Generation of Efficient Nested Loops from Polyhedra},
 journal = {International Journal of Parallel Programming},
 volume = 28,
 number=5,
 month=oct,
 year = 2000,
 pages={469--498},
 abstract = {
  Automatic parallelization in the polyhedral model is based on affine
  transformations from an original computation domain (iteration space) to a
  target space-time domain, often with a different transformation for each
  variable. Code generation is an often ignored step in this process that has a
  significant impact on the quality of the final code. It involves making a
  trade-off between code size and control code simplification/optimization.
  Previous methods of doing code generation are based on loop splitting,
  however they have non-optimal behavior when working on parameterized
  programs. We present a general parameterized method for code generation based
  on dual representation of polyhedra. Our algorithm uses a simple recursion on
  the dimensions of the domains, and enables fine control over the tradeoff
  between code size and control overhead.
 }
}

@misc{Loechner1999polylib,
 title={PolyLib: A library for manipulating parameterized polyhedra},
 author={Loechner, Vincent},
 year={1999},
 url={https://repo.or.cz/polylib.git/blob_plain/HEAD:/doc/parampoly-doc.ps.gz}
}

@article{Kelly1995petit,
  title={New user interface for Petit and other interfaces: user guide},
  author={Kelly, W and Maslov, V and Pugh, W and Rosser, E and Shpeisman, T and Wonnacott, D},
  journal={University of Maryland},
  year={1995}
}

@inproceedings{Collard1995fuzzy,
 title={Fuzzy array dataflow analysis},
 author={Collard, Jean-Fran{\c{c}}ois and Barthou, Denis and Feautrier, Paul},
 booktitle={ACM SIGPLAN Notices},
 volume={30},
 number={8},
 pages={92--101},
 year={1995},
 organization={ACM},
 url={http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.47.187&rep=rep1&type=pdf},
 abstract={
Exact array dataflow analysis can be achieved in the general case if the only
control structures are do-loops and structural ifs, and if loop counter bounds
and array subscripts are affine expressions of englobing loop counters and
possibly some integer constants. In this paper, we begin the study of dataflow
analysis of dynamic control programs, where arbitrary ifs and whiles are
allowed. In the general case, this dataflow analysis can only be fuzzy.
}
}

@book{Catthoor1998custom,
 title={Custom memory management methodology: Exploration of memory organisation for embedded multimedia system design},
 author={Catthoor, Francky and Greef, Eddy de and Suytack, Sven},
 year={1998},
 publisher={Kluwer Academic Publishers},
}

@article{Geigl1997,
  title={Parallelization of loop nests with general bounds in the polyhedron model},
  author={Geigl, Max},
  journal={Master's thesis, Universit at Passau},
  year={1997},
  publisher={Citeseer}
}

@article{Loechner1997parameterized,
  title={Parameterized polyhedra and their vertices},
  author={Loechner, Vincent and Wilde, Doran K},
  journal={International Journal of Parallel Programming},
  volume={25},
  number={6},
  pages={525--549},
  year={1997},
  publisher={Springer},
  abstract={
Algorithms specified for parametrically sized problems are more general
purpose and more reusable than algorithms for fixed sized problems. For this
reason, there is a need for representing and symbolically analyzing linearly
parameterized algorithms. An important class of parallel algorithms can be
described as systems of parameterized affine recurrence equations (PARE). In
this representation, linearly parameterized polyhedra are used to describe the
domains of variables. This paper describes an algorithm which computes the set
of parameterized vertices of a polyhedron, given its representation as a
system of parameterized inequalities. This provides an important tool for the
symbolic analysis of the parameterized domains used to define variables and
computation domains in PAREs. A library of operations on parameterized
polyhedra based on the Polyhedral Library has been written in C and is freely
distributed. 
},
  url={http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.45.2146&rep=rep1&type=pdf}
}

@inproceedings{Griebl1996loopo,
 title={The loop parallelizer LooPo},
 author={Griebl, Martin and Lengauer, Christian},
 booktitle={Proc. Sixth Workshop on Compilers for Parallel Computers},
 volume={21},
 pages={311--320},
 year={1996},
 organization={Citeseer},
 url={http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.30.6700&rep=rep1&type=pdf},
}

@inbook{Feautrier1996automatic,
 author = "Paul Feautrier",
 chapter = "Automatic Parallelization in the Polytope Model",
 title = "The Data Parallel Programming Model",
 year = "1996",
 publisher = "Springer-Verlag",
 volume = "1132",
 series = "LNCS",
 url = {http://www.researchgate.net/publication/2508644_Automatic_Parallelization_in_the_Polytope_Model/file/5046351a4f6b578814.pdf},
 abstract = "
The aim of this paper is to explain the importance of polytope and polyhedra in
automatic parallelization. We show that the semantics of parallel programs is
best described geometrically, as properties of sets of integral points in
n-dimensional spaces, where n is related to the maximum nesting depth of DO
loops. The needed properties translate nicely to properties of polyhedra, for
which many algorithms have been designed for the needs of optimization and
operation research. We show how these ideas apply to scheduling, placement and
parallel code generation.
"
}

@article{Kelly1996omega,
 title = {The {O}mega Calculator and Library, Version 1.1.0},
 author = {Kelly, Wayne and Maslov, Vadim and Pugh, William and Rosser, Evan and Shpeisman, Tatiana and Wonnacott, Dave},
 year = 1996,
 url={http://www.cs.utah.edu/~mhall/cs6963s09/lectures/omega.ps}
}

@phdthesis{Kelly96optimization,
 author = "Wayne Kelly",
 title = "Optimization within a Unified Transformation Framework",
 number = "CS-TR-3725",
 institution =   "Dept. of CS, Univ. of Maryland, College Park",
 year = "1996",
 url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.8219&rep=rep1&type=pdf"
}

@article{Creusillet1996,
 author = {Creusillet, B\'{e}atrice and Irigoin, Francois},
 title = {Interprocedural array region analyses},
 journal = {International Journal of Parallel Programming},
 volume = {24},
 issue = {6},
 month = {December},
 year = {1996},
 issn = {0885-7458},
 url = "http://link.springer.com/chapter/10.1007/BFb0014191",
 abstract = "
Many program optimizations require exact knowledge of the sets of array
elements that are referenced in or that flow between statements or procedures.
Some examples are array privatization, generation of communications in
distributed memory machines, or compile-time optimization of cache behavior in
hierarchical memory machines.

Exact array region analysis is introduced in this article. These regions
exactly represent the effects of statements and procedures upon array
variables. To represent the flow of these data, we also introduce two new types
of array region analyses: IN and OUT regions.

The intraprocedural propagation is presented, as well as a general linear
framework for interprocedural analyses, that handles array reshapes.
"
}

@inproceedings{Clauss1996,
  author =	 {Philippe Clauss},
  title =	 {Counting Solutions to Linear and Nonlinear Constraints
                  Through {E}hrhart Polynomials: Applications to Analyze and
                  Transform Scientific Programs},
  booktitle =	 {International Conference on Supercomputing (ICS'96)},
  year =	 1996,
  pages =	 {278-285},
}

@inproceedings{Amarasinghe1995overview,
 title={An Overview of the SUIF Compiler for Scalable Parallel Machines.},
 author={Amarasinghe, Saman P and Anderson, Jennifer-Ann M and Lam, Monica S and Tseng, Chau-Wen},
 booktitle={PPSC},
 pages={662--667},
 year={1995},
 url={http://www.researchgate.net/publication/2646829_Chapter_1_An_Overview_of_the_SUIF_Compiler_for_Scalable_Parallel_Machines/file/60b7d5193bc4fef52b.pdf},
 abstract={
We are building a compiler that automatically translates sequential scientic
programs into parallel code for scalable parallel machines. Many of the
compiler techniques needed to generate correct and effcient code are common
across all scalable machines, regardless of whether its address space is shared
or distributed. This paper describes the structure of the compiler, emphasizing
the common analyses and optimizations. We focus on the three major phases of
the compiler: parallelism and locality analysis, communication and
synchronization analysis, and code generation.
}
}

@inproceedings{Kelly1995code,
 author = "Kelly, Wayne and Pugh, William and Rosser, Evan",
 title = "Code Generation for Multiple Mappings",
 booktitle = "The 5th Symposium on the Frontiers of Massively Parallel Computation",
 address = "McLean, VA",
 year = 1995,
 url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.23.8696&rep=rep1&type=pdf",
 abstract = "
There has been a great amount of recent work toward unifying iteration
reordering transformations. Many of these approaches represent transformations
as affine mappings from the original iteration space to a new iteration space.
These approaches show a great deal of promise, but they all rely on the ability
to generate code that iterates over the points in these new iteration spaces in
the appropriate order. This problem has been fairly well-studied in the case
where all statements use the same mapping. We have developed an algorithm for
the less well-studied case where each statement uses a potentially different
mapping. Unlike many other approaches, our algorithm can also generate code
from mappings corresponding to loop blocking. We address the important
trade-off between reducing control overhead and duplicating code.
"
}

@conference{Kelly1995unifying,
 author = {Kelly, Wayne and Pugh, William},
 title = {A unifying framework for iteration reordering transformations},
 booktitle = {IEEE First Int. Conf. on Algorithms and Architectures for Parallel Processing (ICAPP 95)},
 volume = {1},
 year = {1995},
 month = apr,
 url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.1382&rep=rep1&type=ps",
 abstract = "
We present a framework for unifying iteration reordering transformations such
as loop interchange, loop distribution, skewing, tiling, index set splitting
and statement reordering. The framework is based on the idea that a
transformation can be represented as a mapping from the original iteration
space to a new iteration space. The framework is designed to provide a uniform
way to represent and reason about transformations. We also provide algorithms
to test the legality of mappings, and to generate optimized code for mappings.
"
}

@article{Pugh1994static,
  title={Static analysis of upper and lower bounds on dependences and parallelism},
  author={Pugh, William and Wonnacott, David},
  journal={ACM Transactions on Programming Languages and Systems (TOPLAS)},
  volume={16},
  number={4},
  pages={1248--1278},
  year={1994},
  publisher={ACM},
  abstract = {
Existing compilers often fail to parallelize sequential code, even when a
program can be manually transformed into parallel form by a sequence of
well-understood transformations (as in the case for many of the Perfect Club
Benchmark programs). These failures can occur for several reasons: the code
transformations implemented in the compiler may not be sufficient to produce
parallel code, the compiler may not find the proper sequence of
transformations, or the compiler may not be able to prove that one of the
necessary transformations is legal. When a compiler fails to extract sufficient
parallelism from a program, the programmer may try to extract additional
parallelism. Unfortunately, the programmer is typically left to search for
parallelism without significant assistance. The compiler generally does not
give feedback about which parts of the program might contain additional
parallelism, or about the types of transformations that might be needed to
realize this parallelism. Standard program transformations and dependence
abstractions cannot be used to provide this feedback. In this paper, we propose
a two-step approach to the search for parallelism in sequential programs. In
the first step, we construct several sets of constraints that describe, for
each statement, which iterations of that statement can be executed
concurrently. By constructing constraints that correspond to different
assumptions about which dependences might be eliminated through additional
analysis, transformations, and user assertions, we can determine whether we can
expose parallelism by eliminating dependences. In the second step of our search
for parallelism, we examine these constraint sets to identify the kinds of
transformations needed to exploit scalable parallelism. Our tests will identify
conditional parallelism and parallelism that can be exposed by combinations of
transformations that reorder the iteration space (such as loop interchange and
loop peeling). This approach lets us distinguish inherently sequential code
from code that contains unexploited parallelism. It also produces information
about the kinds of transformations needed to parallelize the code, without
worrying about the order of application of the transformations. Furthermore,
when our dependence test is inexact we can identify which unresolved
dependences inhibit parallelism by comparing the effects of assuming dependence
or independence. We are currently exploring the use of this information in
programmer-assisted parallelization.
},
  url={http://drum.lib.umd.edu/bitstream/1903/629/4/CS-TR-3250.pdf},
}

@book{pugh1994exact,
  title={An exact method for analysis of value-based array data dependences},
  author={Pugh, William and Wonnacott, David},
  year={1994},
  publisher={Springer},
  abstract={
Standard array data dependence testing algorithms give information
about the aliasing of array references. If statement 1 writes a[5],
and statement 2 later reads a [5], standard techniques described this
as a flow dependence, even if there was an intervening write. We call
a dependence between two references to the same memory location a
memory-based dependence. In contrast, if there are no intervening
writes, the references touch the same value and we call the
dependence a value-based dependence.

There has been a surge of recent work on value-based array data
dependence analysis (also referred to as computation of array
data-flow dependence information). In this paper, we describe a
technique that is exact over programs without control flow (other
than loops) and non-linear references. We compare our proposal with
the technique proposed by Paul Feautrier, which is the other
technique that is complete over the same domain as ours. We also
compare our work with that of Tu and Padua, a representative
approximate scheme for array privatization.
},
  url={http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.22.8688&rep=rep1&type=pdf},
}

@inproceedings{Maslov1994lazy,
 author = {Maslov, Vadim},
 booktitle = {POPL},
 editor = {Boehm, Hans-Juergen and Lang, Bernard and Yellin, Daniel M.},
 isbn = {0-89791-636-0},
 publisher = {ACM Press},
 title = {Lazy Array Data-Flow Dependence Analysis},
 year = 1994,
 url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.37.9427&rep=rep1&type=pdf},
 abstract = "
Automatic parallelization of real FORTRAN programs does not live up to users
expectations yet, and dependence analysis algorithms which either produce too
many false dependences or are too slow to contribute significantly to this. In
this paper we introduce dataflow dependence analysis algorithm which exactly
computes value-based dependence relations for program fragments in which all
subscripts, loop bound and IF conditions are affine. Our algorithm also
computes good affine approximations of dependence relations for non-affine
program fragments. Actually, we do not know about any other algorithm which can
compute better approximations. And our algorithm is efficient too, because it
is lazy. When searching for write statements that supply values used by a given
read statement, it starts with statements which are lexicographically close to
the read statement in iteration space. Then if some of the read statement
instances are not “satisfied” with these close writes, the algorithm broadens
its search scope by looking into more distant writes. The search scope keeps
broadening until all read instances are satisfied or no write candidates are
left. We timed our algorithm on several benchmark programs and the timing
results suggest that our algorithm is fast enough to be used in commercial
compilers—it usually takes 5 to 15 percent of f77 -02 compilation time to
analyze a program. Most programs in the 100-line range take less than 1 second
to analyze on a SUN SparcStation IPX.
"
}


@TechReport{Wilde1993polylib,
  author =	 {Doran K. Wilde},
  title =	 {A Library for Doing Polyhedral Operations},
  institution =	 {IRISA},
  year =	 1993,
  number =	 785,
  month =	 dec
}

@article{Feautrier1992multi,
 author = {Feautrier, Paul},
 affiliation = {Laboratoire MASI Université de Versailles St-Quentin 45 Avenue
                des Etats-Unis 78035 Versailles Cedex France},
 title = {Some efficient solutions to the affine scheduling problem. Part II.
          Multidimensional time},
 journal = {International Journal of Parallel Programming},
 publisher = {Springer Netherlands},
 issn = {0885-7458},
 keyword = {Informatique},
 pages = {389-420},
 volume = {21},
 issue = {6},
 url = {http://dx.doi.org/10.1007/BF01379404},
 note = {10.1007/BF01379404},
 year = {1992},
 abstract = {
  This paper extends the algorithms which were given in Part I to cases in which
  there is no affine schedule, i.e. to problems whose parallel complexity is
  polynomial but not linear. The natural generalization is to multidimensional
  schedules with lexicographic ordering as temporal succession. Multidimensional
  affine schedules, are, in a sense, equivalent to polynomial schedules, and are
  much easier to handle automatically. Furthermore, there is a strong connexion
  between multidimensional schedules and loop nests, which allows one to prove
  that a static control program always has a multidimensional schedule. Roughly,
  a larger dimension indicates less parallelism. In the algorithm which is
  presented here, this dimension is computed dynamically, and is just sufficient
  for scheduling the source program. The algorithm lends itself to a "divide and
  conquer" strategy. The paper gives some experimental evidence for the
  applicability, performances and limitations of the algorithm.
 }
}

@article{Feautrier1992one,
 author = {Feautrier, Paul},
 title = {Some efficient solutions to the affine scheduling problem: Part I.
          One-dimensional time},
 journal = {International Journal of Parallel Programming},
 issue_date = {Oct. 1992},
 volume = {21},
 number = {5},
 month = oct,
 year = {1992},
 issn = {0885-7458},
 pages = {313--348},
 numpages = {36},
 url = {http://dx.doi.org/10.1007/BF01407835},
 doi = {10.1007/BF01407835},
 acmid = {171448},
 publisher = {Kluwer Academic Publishers},
 address = {Norwell, MA, USA},
 keywords = {automatic parallelization, automatic systolic array design,
             scheduling},
 abstract = {
  Programs and systems of recurrence equations may be represented as
  sets of actions which are to be executed subject to precedence constraints. In
  many cases, actions may be labelled by integral vectors in some iteration
  domain, and precedence constraints may be described by affine relations. A
  schedule for such a program is a function which allows one to estimate the
  intrinsic degree of parallelism of the program and to compile a parallel
  version for multiprocessor architectures or systolic arrays. This paper deals
  with the problem of finding closed form schedules as affine or piecewise
  affine functions of the iteration vector. An efficient algorithm which
  reduces the scheduling problem to a parametric linear program of small size,
  which can be readily solved by an efficient algorithm.
 }
}

@inproceedings{Pugh1991uniform,
 title={Uniform Techniques for Loop Optimization},
 author={Pugh, William},
 booktitle={5th International Conference on Supercomputing (ICS'91)},
 pages={341--352},
 year={1991},
 organization={ACM},
}

@article{Feautrier1991,
 title = {Dataflow Analysis of Array and Scalar References},
 author = {Feautrier, Paul},
 journal = {International Journal of Parallel Programming},
 volume = {20},
 number = {1},
 pages = {23--53},
 year = {1991},
 publisher = {Springer},
 abstract = {
  Given a program written in a simple imperative language (assignment
  statements,for loops, affine indices and loop limits), this paper presents an
  algorithm for analyzing the patterns along which values flow as the execution
  proceeds. For each array or scalar reference, the result is the name and
  iteration vector of the source statement as a function of the iteration
  vector of the referencing statement. The paper discusses several applications
  of the method: conversion of a program to a set of recurrence equations,
  array and scalar expansion, program verification and parallel program
  construction.
 }
}

@TECHREPORT{Schreiber90automaticblocking,
 author = {Robert Schreiber and Jack J. Dongarra},
 title = {Automatic blocking of nested loops},
 institution = {},
 year = {1990},
 url = {http://www.netlib.org/utk/papers/autoblock/paper.ps},
}

@InProceedings{Irigoin1988,
  author =	 {Irigoin, Francois and Triolet, Remi},
  title =	 {Supernode Partitioning},
  booktitle =	 {Symposium on Principles of Programming Languages (POPL'88)},
  pages =	 {319--328},
  year =	 1988,
  address =	 {San Diego, CA},
  month =	 jan,
  url =		 {http://ssh.cri.ensmp.fr/classement/doc/A-179.pdf},
  abstract =	 " Supercompilers must reschedule computations defined by
                  nested DO-loops in order to make an efficient use of
                  supercomputer features (vector units, multiple elementary
                  processors, cache memory, etc…). Many rescheduling
                  techniques like loop interchange, loop strip-mining or
                  rectangular partitioning have been described to speedup
                  program execution. We present here a class of partitionings
                  that encompasses previous techniques and provides enough
                  flexibility to adapt code to multiprocessors with two levels
                  of parallelism and two levels of memory. "
}

@inproceedings{Ancourt1991scanning,
  title={Scanning polyhedra with DO loops},
  author={Ancourt, Corinne and Irigoin, Fran{\c{c}}ois},
  booktitle={ACM Sigplan Notices},
  volume={26},
  number={7},
  pages={39--50},
  year={1991},
  organization={ACM},
  abstract={
Supercompilers perform complex program transformations which often result in
new loop bounds. This paper shows that, under the usual assumptions in
automatic parallelization, most transformations on loop nests can be expressed
as affine transformations on integer sets defined by polyhedra and that the new
loop bounds can be computed with algorithms using Fourier's pairwise
elimination method although it is not exact for integer sets. Sufficient
conditions to use pairwise elimination on integer sets and to extend it to
pseudo-linear constraints are also given. A tradeoff has to be made between
dynamic overhead due to some bound slackness and compilation complexity but the
resulting code is always correct. Thse algorithms can be used to onterchange or
block loops regardless of the loop bounds or the blocking strategy and to
safety exchange array parts between two levels of a memory hierarchy or between
neighboring processors in a distributed memory machine.
},
  url={http://hal.inria.fr/docs/00/75/27/74/PDF/A-195.pdf}
}

@article{Feautrier1988parametric,
  title={Parametric Integer Programming},
  author={Feautrier, Paul},
  journal={RAIRO Recherche op{\'e}rationnelle},
  volume={22},
  number={3},
  pages={243--268},
  year={1988},
  abstract={
When analysising computer programs (especially numercial programs in which
arrays are used extensively), one is often confronted with integer programing
problems. These problems have three peculiarities:
  1) feasible points are ranked according to lexicographic order rathern than
the usual linear economic function;
  2) the feasible set depends on integer parameters;
  3) one is interested only in exact solutions.

The difficulty is somewhat alleviated by the fact that problems sizes are usually quite small. In this paper
we show thath:
  1) the classical simplex algorithm has no difficulty in handling lexicographic ordering;
  2) the algorithm may be executed in symbolic mode, this giving the solution of continuous parametric problems;
  3) the method may be extended to problems in integers.

We prove that the resulting algorithm always terminate and give an estimative of its complexity.
},
}

@inproceedings{Feautrier1988array,
  title={Array Expansion},
  author={Feautrier, Paul},
  booktitle={2nd International Conference on Supercomputing (ICS'88)},
  pages={429--441},
  year={1988},
  organization={ACM},
  abstract={
A common problem in restructuring programs for vector or parallel execution is
the suppression of false dependencies which originate in the reuse of the same
memory cell for unrelated values. The method is simple and well understood in
the case of scalars. This paper gives the general solution for the case of
arrays. The expansion is done in two steps: first, modify all definitions of
the offending array in order to obtain the single assignment property. Then,
reconstruct the original data flow by adapting all uses of the array. This is
done with the help of a new algorithm for solving parametric integer programs.
The technique is quite general and may be used for other purposes, including
program checking, collecting array predicates, etc ...
},
  url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.29.5704&rep=rep1&type=pdf}
}

@inproceedings{Cousot1978,
  author =	 {Cousot, Patrick and Halbwachs, Nicolas},
  title =	 {Automatic Discovery of Linear Restraints Among Variables of a
                  Program},
  booktitle =	 {5th ACM SIGACT-SIGPLAN Symposium on Principles of Programming
                  Languages (POPL'78)},
  year =	 1978,
  address =	 {Tucson, Arizona},
  pages =	 {84--96},
  doi =		 {10.1145/512760.512770},
  publisher =	 {ACM},
} 

@article{Lamport1974,
  title =	 {The Parallel Execution of {DO} Loops},
  author =	 {Lamport, Leslie},
  journal =	 {Communications of the ACM},
  volume =	 17,
  number =	 2,
  pages =	 {83--93},
  year =	 1974,
  publisher =	 {ACM},
  url =
                  {http://research.microsoft.com/en-us/um/people/lamport/pubs/do-loops.pdf},
  abstract =	 { Methods are developed for the parallel execution of
                  different iterations of a DO loop. Both asynchronous
                  multiprocessor computers and array computers are
                  considered. Practical application to the design of compilers
                  for such computers is discussed. }
}

@article{Karp1967,
 title = {The Organization of Computations for Uniform Recurrence Equations},
 author = {Karp, Richard M and Miller, Raymond E and Winograd, Shmuel},
 journal = {Journal of the ACM},
 volume = {14},
 number = {3},
 pages = {563--590},
 year = {1967},
 publisher = {ACM},
 url = {http://dl.acm.org/citation.cfm?id=321418},
 abstract = {
  A set equations in the quantities ai(p), where i = 1, 2, · · ·, m and p ranges
  over a set R of lattice points in n-space, is called a system of uniform
  recurrence equations if the following property holds: If p and q are in R and
  w is an integer n-vector, then ai(p) depends directly on aj(p - w) if and
  only if ai(q) depends directly on aj(q - w). Finite-difference approximations
  to systems of partial differential equations typically lead to such recurrence
  equations. The structure of such a system is specified by a dependence graph G
  having m vertices, in which the directed edges are labeled with integer
  n-vectors. For certain choices of the set R, necessary and sufficient
  conditions on G are given for the existence of a schedule to compute all the
  quantities ai(p) explicitly from their defining equations. Properties of such
  schedules, such as the degree to which computation can proceed “in parallel,”
  are characterized. These characterizations depend on a certain iterative
  decomposition of a dependence graph into subgraphs. Analogous results
  concerning implicit schedules are also given.
 }
}

@article{Beletska:2011:CLP:2010586.2010611,
 author = {Beletska, Anna and Bielecki, Wlodzimierz and Cohen, Albert and Palkowski, Marek and Siedlecki, Krzysztof},
 title = {Coarse-grained Loop Parallelization: Iteration Space Slicing vs Affine Transformations},
 journal = {Parallel Comput.},
 issue_date = {August, 2011},
 volume = {37},
 number = {8},
 month = aug,
 year = {2011},
 issn = {0167-8191},
 pages = {479--497},
 numpages = {19},
 url = {http://dx.doi.org/10.1016/j.parco.2010.12.005},
 doi = {10.1016/j.parco.2010.12.005},
 acmid = {2010611},
 publisher = {Elsevier Science Publishers B. V.},
 address = {Amsterdam, The Netherlands, The Netherlands},
 keywords = {Affine transformations, Coarse-grained parallelism, Iteration Space Slicing},
} 

@article{Bielecki2012518,
title = "Free scheduling for statement instances of parameterized arbitrarily nested affine loops ",
journal = "Parallel Computing ",
volume = "38",
number = "9",
pages = "518 - 532",
year = "2012",
note = "",
issn = "0167-8191",
doi = "https://doi.org/10.1016/j.parco.2012.06.001",
url = "http://www.sciencedirect.com/science/article/pii/S0167819112000336",
author = "Wlodzimierz Bielecki and Marek Palkowski and Tomasz Klimek",
keywords = "Affine loops",
keywords = "Free scheduling",
keywords = "Fine-grained parallelism",
keywords = "Transitive closure ",
abstract = "An approach is presented permitting us to build free scheduling for statement instances of affine loops. Under the free schedule, loop statement instances are executed as soon as their operands are available. This allows us to extract maximal fine-grained loop parallelism and minimize the number of synchronization events. The approach is based on calculating the power k of a relation representing exactly all dependences in a loop. In general, such a relation is a union of simpler relations. When there are troubles with calculating free scheduling due to the large number of simpler dependence relations, another technique is discussed allowing for extracting free scheduling in an iteration subspace defined by indices of inner nests of this loop. We demonstrate that if we are able to calculate the power k of a dependence relation describing all dependences in the loop, then we are able also to produce free scheduling. Experimental results exposing the effectiveness, efficiency, and time complexity of the algorithms are outlined. Problems to be resolved in the future to utilize the entire power of the presented techniques are discussed. "
}

@Article{Bielecki2016,
  Title                    = {Tiling of arbitrarily nested loops by means of the transitive closure of dependence graphs},
  Author                   = {Wlodzimierz Bielecki and Marek Palkowski},
  Journal                  = {International Journal of Applied Mathematics and Computer Science (AMCS)},
  Year                     = {2016},
  Month                    = {December},
  Number                   = {4},
  Volume                   = {Vol. 26}
  }
  

@Article{Palkowski2017,
  Title                    = {Parallel tiled Nussinov RNA folding loop nest generated using both dependence graph transitive closure and loop skewing},
  Author                   = {Marek Palkowski and Wlodzimierz Bielecki},
  Journal                  = {BMC Bioinformatics},
  Year                     = {2017},

  Month                    = {June},
  Volume                   = {18:290},

  Doi                      = {10.1186/s12859-017-1707-8},
  Owner                    = {marek},
  Timestamp                = {2017.06.07},
  Url                      = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-017-1707-8}
}

@Article{TRACOCOMP2017,
author     =   "Bielecki, Wlodzimierz and Palkowski, Marek and 
                Skotnicki, Piotr",
title      =   "Generation of parallel synchronization-free tiled code",
journal    =   "Computing",
year       =   "2017",
month      =   "Oct",
day        =   "20",
issn       =   "1436-5057",
doi        =   "10.1007/s00607-017-0576-3",
}

@Article{Palkowski2018,
author="Palkowski, Marek
and Bielecki, Wlodzimierz",
title="Tuning iteration space slicing based tiled multi-core code implementing Nussinov's RNA folding",
journal="BMC Bioinformatics",
year="2018",
month="Jan",
day="15",
volume="19",
number="1",
pages="12",
abstract="RNA folding is an ongoing compute-intensive task of bioinformatics. Parallelization and improving code locality for this kind of algorithms is one of the most relevant areas in computational biology. Fortunately, RNA secondary structure approaches, such as Nussinov's recurrence, involve mathematical operations over affine control loops whose iteration space can be represented by the polyhedral model. This allows us to apply powerful polyhedral compilation techniques based on the transitive closure of dependence graphs to generate parallel tiled code implementing Nussinov's RNA folding. Such techniques are within the iteration space slicing framework -- the transitive dependences are applied to the statement instances of interest to produce valid tiles. The main problem at generating parallel tiled code is defining a proper tile size and tile dimension which impact parallelism degree and code locality.",
issn="1471-2105",
doi="10.1186/s12859-018-2008-6",
url="https://doi.org/10.1186/s12859-018-2008-6"
}

@Article{Palkowski2018jcb,
author="Palkowski, Marek
and Bielecki, Wlodzimierz",
title="Parallel Tiled Codes Implementing the Smith–Waterman Alignment Algorithm for Two and Three Sequences",
journal="Journal of Computational Biology",
year="2018",
month="Jul",
day="11",
volume="Fast track",
abstract="The Smith–Waterman (SW) algorithm explores all the possible alignments between two or more sequences and as a result it returns the optimal local alignment. However, the computational cost of this algorithm is very high, and the exponential growth of computation makes SW unrealistic for searching similarities in large sets of sequences. Fortunately, the dynamic programming kernel of the SW algorithm involves mathematical operations over affine control loops whose iteration space can be represented by the polyhedral model. This allows us to apply polyhedral compilation techniques to optimize the studied SW dense array code. In this article, we present an approach to generate efficient SW implementations for two and three sequences by using the transitive closure of a dependence graph and loop skewing. Generated programs are represented with parallel tiled loop nests, which expose significantly higher performance than that of programs obtained with closely related compilers. The approach is able to tile all loops of original loop nests as opposed to well-known affine transformation techniques. Furthermore, it allows for code optimization of three-sequence alignment. Such a code cannot be generated by means of state-of-the-art automatic optimizing compilers. We demonstrate that an under-approximation of transitive closure (instead of exact transitive closure) can be used to generate valid parallel tiled code. This considerably reduces the computational complexity of the approach. Generated codes were run on cores of a modern Intel multiprocessor and they expose high speedup and good scalability on this platform.",
doi="https://doi.org/10.1089/cmb.2018.0006",
url="https://www.liebertpub.com/doi/10.1089/cmb.2018.0006"
}