Benchmarks containing static control parts.
Tools and libraries to translate a polyhedral representation into source code.
Tools and libraries to translate a high-level language into a polyhedral representation.
Libraries to perform calculations on polyhedra.
@INPROCEEDINGS{ALIAS2021DPN,
TITLE = {Data-aware process networks},
AUTHOR = {Alias, Christophe and Plesco, Alexandru},
BOOKTITLE = {Proceedings of the 30th ACM SIGPLAN International Conference on Compiler Construction},
PAGES = {1--11},
YEAR = {2021},
KEYWORDS = {High-Level Synthesis, FPGA, Automatic Parallelization, Polyhedral Model}
}
@ARTICLE{BIELECKI2016,
TITLE = {Tiling of arbitrarily nested loops by means of the transitive closure of dependence graphs},
AUTHOR = {Bielecki, Wlodzimierz and Palkowski, Marek},
JOURNAL = {International Journal of Applied Mathematics and Computer Science (AMCS)},
YEAR = {2016},
MONTH = {December},
NUMBER = {4},
VOLUME = {26}
}
@ARTICLE{JIMBOREAN2014SPECULATIVE,
AUTHOR = {Jimborean, Alexandra
and Clauss, Philippe
and Dollinger, Jean-Fran{\c{c}}ois
and Loechner, Vincent
and Martinez Caama{\~{n}}o, Juan Manuel},
TITLE = {Dynamic and Speculative Polyhedral Parallelization Using Compiler-Generated Skeletons},
JOURNAL = {International Journal of Parallel Programming},
YEAR = {2014},
VOLUME = {42},
NUMBER = {4},
PAGES = {529--545},
ABSTRACT = {
We propose a framework based on an original generation and use of algorithmic skeletons, and dedicated to speculative parallelization of scientific nested loop kernels, able to apply at run-time polyhedral transformations to the target code in order to exhibit parallelism and data locality. Parallel code generation is achieved almost at no cost by using binary algorithmic skeletons that are generated at compile-time, and that embed the original code and operations devoted to instantiate a polyhedral parallelizing transformation and to verify the speculations on dependences. The skeletons are patched at run-time to generate the executable code. The run-time process includes a transformation selection guided by online profiling phases on short samples, using an instrumented version of the code. During this phase, the accessed memory addresses are used to compute on-the-fly dependence distance vectors, and are also interpolated to build a predictor of the forthcoming accesses. Interpolating functions and distance vectors are then employed for dependence analysis to select a parallelizing transformation that, if the prediction is correct, does not induce any rollback during execution. In order to ensure that the rollback time overhead stays low, the code is executed in successive slices of the outermost original loop of the nest. Each slice can be either a parallel version which instantiates a skeleton, a sequential original version, or an instrumented version. Moreover, such slicing of the execution provides the opportunity of transforming differently the code to adapt to the observed execution phases, by patching differently one of the pre-built skeletons. The framework has been implemented with extensions of the LLVM compiler and an x86-64 runtime system. Significant speed-ups are shown on a set of benchmarks that could not have been handled efficiently by a compiler.},
ISSN = {1573-7640},
DOI = {10.1007/s10766-013-0259-4},
URL = {http://dx.doi.org/10.1007/s10766-013-0259-4}
}
@ARTICLE{VERDOOLAEGE2013PPCG,
TITLE = {Polyhedral parallel code generation for {CUDA}},
AUTHOR = {Verdoolaege, Sven and Juega, Juan Carlos and Cohen, Albert and
G\'{o}mez, Jos{\'e} Ignacio and Tenllado, Christian and
Catthoor, Francky},
JOURNAL = {ACM Transactions on Architecture and Code Optimization},
ISSUE_DATE = {January 2013},
VOLUME = {9},
NUMBER = {4},
MONTH = {January},
YEAR = {2013},
ISSN = {1544-3566},
PAGES = {54:1--54:23},
DOI = {10.1145/2400682.2400713},
ACMID = {2400713},
PUBLISHER = {ACM},
ADDRESS = {New York, NY, USA}
}
@INPROCEEDINGS{CHEN2012,
AUTHOR = {Chen, Chun},
TITLE = {Polyhedra scanning revisited},
BOOKTITLE = {Conference on Programming Language Design and Implementation},
YEAR = {2012},
PAGES = {499--508},
NUMPAGES = {10},
PUBLISHER = {ACM},
ADDRESS = {New York, NY, USA},
KEYWORDS = {polyhedra scanning, polyhedral transformations},
URL = {http://ctop.cs.utah.edu/downloads/pldi128-chen.pdf},
ABSTRACT = {
This paper presents a new polyhedra scanning system called CodeGen+ to
address the challenge of generating high-performance code for complex iteration
spaces resulting from compiler optimization and autotuning systems. The
strength of our approach lies in two new algorithms. First, a loop overhead
removal algorithm provides precise control of trade-offs between loop overhead
and code size based on actual loop nesting depth. Second, an if-statement
simplification algorithm further reduces the number of comparisons in the code.
These algorithms combined with the expressive power of Presburger arithmetic
enable CodeGen+ to support complex optimization strategies expressed in
iteration spaces. We compare with the state-of-the-art polyhedra scanning tool
CLooG on five loop nest computations, demonstrating that CodeGen+ generates
code that is simpler and up to 1.15x faster.
}
}
@ARTICLE{GROSSER2012POLLY,
TITLE = {Polly -- Performing polyhedral optimizations on a low-level intermediate representation},
AUTHOR = {Grosser, Tobias and Gr{\"o}{\ss}linger, Armin and Lengauer, Christian},
JOURNAL = {Parallel Processing Letters},
VOLUME = {22},
NUMBER = {04},
YEAR = {2012},
PUBLISHER = {World Scientific},
ABSTRACT = {
The polyhedral model for loop parallelization has proved to be an effective
tool for advanced optimization and automatic parallelization of programs in
higher-level languages. Yet, to integrate such optimizations seamlessly into
production compilers, they must be performed on the compiler's internal,
low-level, intermediate representation (IR). With Polly, we present an
infrastructure for polyhedral optimizations on such an IR. We describe the
detection of program parts amenable to a polyhedral optimization (so-called
static control parts), their translation to a Z-polyhedral representation,
optimizations on this representation and the generation of optimized IR code.
Furthermore, we define an interface for connecting external optimizers and
present a novel way of using the parallelism they introduce to generate SIMD
and OpenMP code. To evaluate Polly, we compile the PolyBench 2.0 benchmarks
fully automatically with PLuTo as external optimizer and parallelizer. We can
report on significant speedups.
},
URL = {http://www.worldscientific.com/doi/abs/10.1142/S0129626412500107}
}
@INPROCEEDINGS{VERDOOLAEGE2012,
AUTHOR = {Verdoolaege, Sven and Grosser, Tobias },
TITLE = {Polyhedral Extraction Tool},
BOOKTITLE = {Second International Workshop on Polyhedral Compilation Techniques (IMPACT'12)},
ADDRESS = {Paris, France},
MONTH = {January},
YEAR = {2012},
URL = {http://impact.gforge.inria.fr/impact2012/workshop_IMPACT/verdoolaege.pdf},
ABSTRACT = {
We present a new library for extracting a polyhedral model from C source. The
library is based on clang, the LLVM C frontend, and isl, a library for
manipulating quasi-affine sets and relations. The use of clang for parsing the
C code brings advanced diagnostics and full support for C99. The use of isl
allows for an easy construction and a powerful and compact representation of
the polyhedral model. Besides allowing arbitrary piecewise quasi-affine index
expressions and conditions, the library also supports some data dependent
constructs and has special treatment for unsigned integers. The library has
been successfully used to obtain polyhedral models for use in an equivalence
checker, a tool for constructing polyhedral process networks, a parallelizer
targeting GPUs and an interactive polyhedral environment.
}
}
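As an illustration of the kind of input the extraction tools above handle, the following C kernel (a hypothetical example, not taken from the cited paper) is a static control part: its loop bounds and array subscripts are quasi-affine expressions in the surrounding iterators and the parameters n and m, so a tool such as pet can translate it into a polyhedral representation. The scop/endscop pragmas mark the region pet analyses by default.

/* Hypothetical kernel, for illustration only: an affine loop nest forming
 * a static control part (SCoP). */
void matvec(int n, int m, double A[n][m], double x[m], double y[n])
{
#pragma scop
    for (int i = 0; i < n; i++) {
        y[i] = 0.0;                  /* statement S1, domain {S1[i] : 0 <= i < n} */
        for (int j = 0; j < m; j++)
            y[i] += A[i][j] * x[j];  /* statement S2, domain {S2[i,j] : 0 <= i < n, 0 <= j < m} */
    }
#pragma endscop
}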
@INPROCEEDINGS{YUKI2012ALPHAZ,
TITLE = {{AlphaZ}: A System for Design Space Exploration in the Polyhedral Model},
AUTHOR = {Yuki, Tomofumi and Gupta, Gautam and Kim, DaeGon and Pathan, Tanveer and Rajopadhye, Sanjay},
BOOKTITLE = {Proceedings of the 25th International Workshop on Languages and Compilers for Parallel Computing},
YEAR = {2012},
URL = {http://people.rennes.inria.fr/Tomofumi.Yuki/papers/yuki-lcpc2012.pdf},
ABSTRACT = {
The polyhedral model is now a well established and effective formalism for
program optimization and parallelization. However, finding optimal
transformations is a long-standing open problem. It is therefore important to
develop tools that, rather than following predefined optimization criteria,
allow practitioners to explore different choices through script-driven or
user-guided transformations. More than practitioners, such flexibility is even
more important for compiler researchers and auto-tuner developers. In addition,
tools must also raise the level of abstraction by representing and manipulating
reductions and scans explicitly. And third, the tools must also be able to
explore transformation choices that consider memory (re)-allocation.
AlphaZ is a system that allows exploration of optimizing transformations in the
polyhedral model that meets these goals. We illustrate its power through two
examples of optimizations that existing parallelization tools cannot perform,
but can be systematically applied using our system. One is time-tiling of a
code from PolyBench that resembles the Alternating Direction Implicit (ADI)
method, and the other is a transformation that brings down the algorithmic
complexity of a kernel in UNAfold, a sequence alignment software, from O(N^4)
to O(N^3).}
}
@MISC{POUCHET2011POLYOPT,
TITLE = {{PolyOpt}, a polyhedral optimizer for the {ROSE} compiler},
AUTHOR = {Pouchet, Louis-No{\"e}l},
YEAR = {2011},
MONTH = {July}
}
@INPROCEEDINGS{VERDOOLAEGE2011IMPACT,
AUTHOR = {Verdoolaege, Sven},
TITLE = {Counting Affine Calculator and Applications},
BOOKTITLE = {1st International Workshop on Polyhedral Compilation Techniques (IMPACT)},
YEAR = {2011},
EDITOR = {C. Alias and C. Bastoul},
ADDRESS = {Chamonix, France},
ABSTRACT = {
We present an interactive tool, called iscc, for manipulating sets and
relations of integer tuples bounded by affine constraints over the set
variables, parameters and existentially quantified variables. A distinguishing
feature of iscc is that it provides a cardinality operation on sets and
relations that computes a symbolic expression in terms of the parameters and
domain variables for the number of elements in the set or the image of the
relation. In particular, these expressions are piecewise quasipolynomials,
which can be further manipulated in iscc. Besides basic operations on sets and
piecewise quasipolynomials, iscc also provides an interface to code generation,
lexicographic optimization, dependence analysis, transitive closures and the
symbolic computation of upper bounds and sums of piecewise quasipolynomials
over their domains.
},
URL = {http://perso.ens-lyon.fr/christophe.alias/impact2011/impact-05.pdf}
}
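To make the cardinality operation described in the abstract above concrete, here is a minimal C sketch using isl, the library on which iscc is built (iscc itself is driven interactively; this sketch only reproduces the counting step). It assumes isl is installed and the program is linked with -lisl; the set, its statement name S and the parameter n are illustrative choices, not taken from the paper.

#include <stdio.h>
#include <stdlib.h>
#include <isl/ctx.h>
#include <isl/set.h>
#include <isl/polynomial.h>
#include <isl/printer.h>

int main(void)
{
    isl_ctx *ctx = isl_ctx_alloc();

    /* A triangular iteration domain parameterized by n. */
    isl_set *dom = isl_set_read_from_str(ctx,
        "[n] -> { S[i,j] : 0 <= i < n and 0 <= j <= i }");

    /* Symbolic count of the points: a piecewise quasi-polynomial in n,
     * here n*(n+1)/2 for n >= 1.  isl_set_card consumes dom. */
    isl_pw_qpolynomial *card = isl_set_card(dom);

    isl_printer *p = isl_printer_to_str(ctx);
    p = isl_printer_print_pw_qpolynomial(p, card);
    char *str = isl_printer_get_str(p);
    printf("card = %s\n", str);

    free(str);
    isl_printer_free(p);
    isl_pw_qpolynomial_free(card);
    isl_ctx_free(ctx);
    return 0;
}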
@ARTICLE{BELAOUCHA2010FADALIB,
TITLE = {FADAlib: an open source C++ library for fuzzy array dataflow analysis},
AUTHOR = {Belaoucha, Marouane and Barthou, Denis and Eliche, Adrien and Touati, Sid-Ahmed-Ali},
JOURNAL = {Procedia Computer Science},
VOLUME = {1},
NUMBER = {1},
PAGES = {2075--2084},
YEAR = {2010},
PUBLISHER = {Elsevier},
URL = {http://www-sop.inria.fr/members/Sid.Touati/publis/Touati-PAPP2010.pdf},
ABSTRACT = {
Ubiquitous multicore architectures require that many levels of parallelism have
to be found in codes. Dependence analysis is the main approach in compilers for
the detection of parallelism. It enables vectorisation and automatic
parallelisation, among many other optimising transformations, and is therefore
of crucial importance for optimising compilers. This paper presents new open
source software, \texttt{FADAlib}, performing an instance-wise dataflow
analysis for scalar and array references. The software is a C++ implementation
of the Fuzzy Array Dataflow Analysis (FADA) method. This method can be applied
on codes with irregular control such as \texttt{while}-loops,
\texttt{if-then-else} or non-regular array accesses, and computes exact
instance-wise dataflow analysis on regular codes. As far as we know,
\texttt{FADAlib} is the first released open source C++ implementation of
instance-wise data flow dependence handling larger classes of programs. In
addition, the library is technically independent from an existing compiler; It
can be plugged in many of them; this article shows an example of a successful
integration inside gcc/GRAPHITE. We give details concerning the library
implementation and then report some initial results with gcc and possible use
for trace scheduling on irregular codes.
}
}
@INPROCEEDINGS{BONDHUGULA2010XLC,
AUTHOR = {Bondhugula, Uday and
G{\"u}nl{\"u}k, Oktay and
Dash, Sanjeeb and
Renganarayanan, Lakshminarayanan},
TITLE = {A model for fusion and code motion in an automatic parallelizing
compiler},
BOOKTITLE = {International Conference on Parallel Architectures and Compilation Techniques (PACT)},
YEAR = {2010},
PAGES = {343--352},
URL = {http://drona.csa.iisc.ernet.in/~uday/publications/uday-pact10.pdf},
ABSTRACT = {
Loop fusion has been studied extensively, but in a manner isolated from other
transformations. This was mainly due to the lack of a powerful intermediate
representation for application of compositions of high-level transformations.
Fusion presents strong interactions with parallelism and locality. Currently,
there exist no models to determine good fusion structures integrated with all
components of an auto-parallelizing compiler. This is also one of the reasons
why all the benefits of optimization and automatic parallelization of long
sequences of loop nests spanning hundreds of lines of code have never been
explored.
We present a fusion model in an integrated automatic parallelization framework
that simultaneously optimizes for hardware prefetch stream buffer utilization,
locality, and parallelism. Characterizing the legal space of fusion structures
in the polyhedral compiler framework is not difficult. However, incorporating
useful optimization criteria into such a legal space to pick good fusion
structures is very hard. The model we propose captures utilization of hardware
prefetch streams, loss of parallelism, as well as constraints imposed by
privatization and code expansion into a single convex optimization space. The
model scales very well to program sections spanning hundreds of lines of code.
It has been implemented into the polyhedral pass of the IBM XL optimizing
compiler. Experimental results demonstrate its effectiveness in finding good
fusion structures for codes including SPEC benchmarks and large applications.
An improvement ranging from 5% to nearly a factor of 2.75x is obtained over the
current production compiler optimizer on these benchmarks.
}
}
@INPROCEEDINGS{TRIFINOVIC2010,
AUTHOR = {Trifunovi\'c, Konrad and Cohen, Albert and Edelsohn, David
and Li, Feng and Grosser, Tobias and Jagasia, Harsha
and Ladelsky, Razya and Pop, Sebastian and Sj\"odin, Jan
and Upadrasta, Ramakrishna },
TITLE = {{GRAPHITE} Two Years After: First Lessons Learned From
Real-World Polyhedral Compilation},
BOOKTITLE = {2nd GCC Research Opportunities Workshop (GROW)},
YEAR = {2010},
URL = {http://citeseerx.ist.psu.edu/viewdoc/download?rep=rep1&type=pdf&doi=10.1.1.220.3386},
ABSTRACT = {
Modern compilers are responsible for adapting the semantics of source programs
into a form that makes efficient use of a highly complex, heterogeneous
machine. This adaptation amounts to solve an optimization problem in a huge and
unstructured search space, while predicting the performance outcome of complex
sequences of program transformations. The polyhedral model of compilation is
aimed at these challenges. Its geometrical, non-inductive semantics enables the
construction of better-structured optimization problems and precise analytical
models. Recent work demonstrated the scalability of the main polyhedral
algorithms to real-world programs. Its integration into production compilers is
under way, pioneered by the Graphite branch of the GNU Compiler Collection
(GCC). Two years after the effective beginning of the project, this paper
reports on original questions and innovative solutions that arose during the
design and implementation of Graphite.
}
}
@INPROCEEDINGS{VERDOOLAEGE2010ISL,
AUTHOR = {Verdoolaege, Sven},
TITLE = {isl: An Integer Set Library for the Polyhedral Model},
BOOKTITLE = {Mathematical Software (ICMS'10)},
YEAR = {2010},
PAGES = {299--302},
SERIES = {LNCS 6327},
EDITOR = {Fukuda, Komei and van der Hoeven, Joris and Joswig, Michael and
Takayama, Nobuki},
PUBLISHER = {Springer-Verlag}
}
@ARTICLE{BAGNARA2008PARMA,
TITLE = {The Parma Polyhedra Library: Toward a complete set of numerical abstractions for the analysis and verification of hardware and software systems},
AUTHOR = {Bagnara, Roberto and Hill, Patricia M and Zaffanella, Enea},
JOURNAL = {Science of Computer Programming},
VOLUME = {72},
NUMBER = {1},
PAGES = {3--21},
YEAR = {2008},
PUBLISHER = {Elsevier},
URL = {http://bugseng.com/products/ppl/documentation/BagnaraHZ08SCP.pdf},
ABSTRACT = {
Since its inception as a student project in 2001, initially just for the
handling (as the name implies) of convex polyhedra, the Parma Polyhedra Library
has been continuously improved and extended by joining scrupulous research on
the theoretical foundations of (possibly non-convex) numerical abstractions to
a total adherence to the best available practices in software development. Even
though it is still not fully mature and functionally complete, the Parma
Polyhedra Library already offers a combination of functionality, reliability,
usability and performance that is not matched by similar, freely available
libraries. In this paper, we present the main features of the current version
of the library, emphasizing those that distinguish it from other similar
libraries and those that are important for applications in the field of
analysis and verification of hardware and software systems.
}
}
@MISC{BASTOUL2008CLAN,
TITLE = {Clan - A polyhedral representation extractor for high level programs},
AUTHOR = {Bastoul, Cedric},
YEAR = {2008}
}
@ARTICLE{BONDHUGULA2008PLUTO,
AUTHOR = {Bondhugula, Uday and Hartono, Albert and Ramanujam, J. and Sadayappan, P.},
TITLE = {A practical automatic polyhedral parallelizer and locality optimizer},
JOURNAL = {SIGPLAN Notices},
VOLUME = {43},
NUMBER = {6},
YEAR = {2008},
ISSN = {0362-1340},
PAGES = {101--113},
DOI = {10.1145/1379022.1375595},
PUBLISHER = {ACM},
ADDRESS = {New York, NY, USA}
}
@TECHREPORT{CHEN2008CHILL,
AUTHOR = {Chen, Chun and Chame, Jacqueline and Hall, Mary},
TITLE = {A framework for composing high-level loop transformations},
INSTITUTION = {USC},
YEAR = {2008},
URL = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.214.8396&rep=rep1&type=pdf},
ABSTRACT = {
This paper describes a general and robust loop transformation framework that
enables compilers to generate efficient code on complex loop nests. Despite two
decades of prior research on loop optimization, performance of
compiler-generated code often falls short of manually optimized versions, even
for some well-studied BLAS kernels. There are two primary reasons for this.
First, today’s compilers employ fixed transformation strategies, making it
difficult to adapt to different optimization requirements for different
application codes. Second, code transformations are treated in isolation, not
taking into account the interactions between different transformations. This
paper addresses such limitations in a unified framework that supports a broad
collection of transformations, (permutation, tiling, unroll-and-jam, data
copying, iteration space splitting, fusion, distribution and others), which go
beyond existing polyhedral transformation models. This framework is a key
element of a compiler we are developing which performs empirical optimization
to evaluate a collection of alternative optimized variants of a code segment. A
script interface to code generation and empirical search permits transformation
parameters to be adjusted independently and tested; alternative scripts are
used to represent different code variants. By applying this framework to
example codes, we show performance results on automatically generated code for
the Pentium M and MIPS R10000 that are comparable to the best hand-tuned codes,
and significantly better (up to a 14x speedup) than the native compilers.
}
}
@ARTICLE{SCHWEITZ2006RSTREAM,
TITLE = {R-stream: A parametric high level compiler},
AUTHOR = {Schweitz, Eric and Lethin, Richard and Leung, Allen and Meister, Benoit},
JOURNAL = {Proceedings of HPEC},
YEAR = {2006},
URL = {http://llwww.ll.mit.edu/HPEC/agendas/proc06/Day2/21_Schweitz_Pres.pdf}
}
@INPROCEEDINGS{BASTOUL2004CLOOG,
AUTHOR = {Bastoul, C\'{e}dric},
TITLE = {Code Generation in the Polyhedral Model Is Easier Than You Think},
BOOKTITLE = {IEEE International Conference on Parallel Architecture
and Compilation Techniques},
YEAR = {2004},
PAGES = {7--16},
MONTH = {September},
ADDRESS = {Juan-les-Pins, France},
ABSTRACT = {
Many advances in automatic parallelization and optimization have been
achieved through the polyhedral model. It has been extensively shown that this
computational model provides convenient abstractions to reason about and apply
program transformations. Nevertheless, the complexity of code generation has
long been a deterrent for using polyhedral representation in optimizing
compilers. First, code generators have a hard time coping with generated code
size and control overhead that may spoil theoretical benefits achieved by the
transformations. Second, this step is usually time consuming, hampering the
integration of the polyhedral framework in production compilers or
feedback-directed, iterative optimization schemes. Moreover, current code
generation algorithms only cover a restrictive set of possible transformation
functions. This paper discusses a general transformation framework able to
deal with non-unimodular, non-invertible, non-integral or even non-uniform
functions. It presents several improvements to a state-of-the-art code
generation algorithm. Two directions are explored: generated code size and
code generator efficiency. Experimental evidence proves the ability of the
improved method to handle real-life problems.
}
}
@MISC{LOECHNER1999POLYLIB,
TITLE = {PolyLib: A library for manipulating parameterized polyhedra},
AUTHOR = {Loechner, Vincent},
YEAR = {1999},
URL = {https://repo.or.cz/polylib.git/blob_plain/HEAD:/doc/parampoly-doc.ps.gz}
}
@INPROCEEDINGS{GRIEBL1996LOOPO,
TITLE = {The loop parallelizer LooPo},
AUTHOR = {Griebl, Martin and Lengauer, Christian},
BOOKTITLE = {Proc. Sixth Workshop on Compilers for Parallel Computers},
VOLUME = {21},
PAGES = {311--320},
YEAR = {1996},
URL = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.30.6700&rep=rep1&type=pdf}
}
@ARTICLE{KELLY1996OMEGA,
TITLE = {The {O}mega Calculator and Library, Version 1.1.0},
AUTHOR = {Kelly, Wayne and Maslov, Vadim and Pugh, William and Rosser, Evan and Shpeisman, Tatiana and Wonnacott, Dave},
YEAR = {1996},
URL = {http://www.cs.utah.edu/~mhall/cs6963s09/lectures/omega.ps}
}
@ARTICLE{KELLY1995PETIT,
TITLE = {New user interface for Petit and other interfaces: user guide},
AUTHOR = {Kelly, Wayne and Maslov, Vadim and Pugh, William and Rosser, Evan and Shpeisman, Tatiana and Wonnacott, Dave},
JOURNAL = {University of Maryland},
YEAR = {1995}
}