diff --git a/main.typ b/main.typ
index e2f1cc4..1703b8e 100644
--- a/main.typ
+++ b/main.typ
@@ -53,18 +53,8 @@
 #it.body
 ]
 #show figure.where(kind: "raw"): set figure(supplement: [Listing])
-#show figure.where(kind: "raw"): it => align(left)[
- #v(8pt, weak: true)
- #it.body
- #v(4pt, weak: true)
- #it.caption
- #v(8pt, weak: true)
-]
-#show figure: it => [
- #v(1.25em, weak: true)
- #it
- #v(1.25em, weak: true)
-]
+#show figure.where(kind: "raw"): it => align(left, { v(8pt, weak: true); it.body; v(4pt, weak: true); it.caption; v(8pt, weak: true) })
+#show figure: it => { v(1.25em, weak: true); it; v(1.25em, weak: true) }
 #show ref: it => {
   let el = it.element
   if el != none {
@@ -79,15 +69,9 @@
   }
   return it
 }
-#show heading.where(level: 1): it => [
- #v(4mm, weak: true)
- #it
-]
-#show heading.where(level: 2): it => [
- #v(2mm, weak: true)
- #it
-]
-
+#show heading.where(level: 1): it => { v(4mm, weak: true); it }
+#show heading.where(level: 2): it => { v(2mm, weak: true); it }
+#show heading.where(level: 3): it => { v(1.5mm, weak: true); it }
 #show: alexandria(prefix: "x-", read: p => read(p))
 #show: alexandria(prefix: "y-", read: p => read(p))
 #let cgx = it => citegroup(it, prefix: "x-")
@@ -113,7 +97,7 @@ This work is divided into the following sections: in @background_c the backgroun
 = Background
 == Static Single Assignment form (SSA / SSA form)
 #figure( // ssa_form_example
-  caption: [C code and respective SSA in control flow graph form, adapted from Fig.1 in the work of Reissmann, Meyer and Soffa @y-reissmann_rvsdg_2020],
+  caption: [C code and respective SSA in control flow graph form, adapted from Fig. 1 in the work of Reissmann, Meyer and Soffa @y-reissmann_rvsdg_2020],
   kind: "raw",
   grid(
     columns: (1fr, 1.25fr),
@@ -149,12 +133,27 @@ Backward analysis goes from the exit of the program to the entry, thereby enabli
 ```
 )
 The facts which the algorithm knows about a variable either must be true or may be true. When a fact must be true, every path leading to the current point has to ensure that the fact holds. The facts in @must_may_example on line 4 are: `x` and `z` must be initialized, since that is done in both branches of the if, while `y` only may be initialized. `x` also must be `2`, since it gets assigned `2` in both branches; `z` may be `2` or may be `1`. The must constraints are mostly used for optimization, while the may constraints are mostly used for showing warnings.
+=== Points-to analysis @x-rayside_pointsto_2005
+Points-to analysis extends DFA to programs with pointers and references. Specifically, it shows whether one variable can point to another variable during the execution of the program. Points-to analysis has multiple levels of precision. \
+One of the most important aspects for precision is context-sensitivity. Given a function `void* id(void* p)` which just returns the pointer `p` it is given, a context-insensitive points-to analysis concludes that every pointer supplied as an argument could be returned as a result, while a context-sensitive analysis only yields the pointer supplied at the respective call site. As an example: with the code `void *a, *b; id(a); id(b);` you would get `id(a), id(b) ∈ {a, b}` because the analysis cannot differentiate between those calls, while with context-sensitive analysis it would be `id(a) == a` and `id(b) == b`. \
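+The following minimal C sketch (our illustration of the example above, not taken from the cited source) makes the difference concrete:
+```C
+#include <stdio.h>
+
+void *id(void *p) { return p; } /* returns exactly the pointer it is given */
+
+int main(void) {
+    int x = 1, y = 2;
+    void *a = &x, *b = &y;
+    void *r1 = id(a); /* context-sensitive:   r1 can only point to x          */
+    void *r2 = id(b); /* context-insensitive: r1 and r2 may each point to x or y */
+    printf("%d %d\n", *(int *)r1, *(int *)r2);
+    return 0;
+}
+```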
+There are many design choices, impacting both performance and precision, that can be made when implementing points-to analysis:
+Subset-based analysis gives each pointer its own set of variables to which it may point. When pointer `a` is assigned to pointer `b` (`b = a;`), the variables which `a` points to must be a subset of those `b` points to (`pts(b) ⊇ pts(a)`).
+A faster but less precise variation is equivalence-based points-to analysis. Instead of keeping a separate set for each pointer, it merges the sets of pointers which are assigned to one another into a single equivalence class. This merging speeds up the analysis and reduces its memory consumption, but leads to information loss.
+An even more precise method, and the one most relevant to this paper, is flow-sensitive analysis. By analyzing the control flow it is possible to determine precisely which variable a pointer points to at a certain point in the code and to make optimizations based on that. The drawbacks are the bad performance of the analysis and the complicated implementation. \
+While subset-based and equivalence-based analyses are enough for simple optimizations and simple compile-time checks, safety-critical applications and complex optimizations require context- and flow-sensitive algorithms. This choice also has to take into account the size of the analyzed codebase and how long the compile time may be.
+// TODO summary-based
 === Constant folding and propagation @x-optimizing_compiler_wikipedia
 An example based on @ssa_form_example would be the compiler calculating $x_1$ to be $8$. This is called constant folding and is done by replacing all calculations which can already be evaluated at compile time with their result. Constant propagation then replaces the $x_1$ in the calculation of $x_2$ with its value. When constant folding is then applied again, $x_2$ would be $6$.
-=== Dead branch elimination @x-optimizing_compiler_wikipedia
+=== Conditional branch elimination
 Continuing from the last optimization, it would be possible to conclude that the branch condition $x_2 < 4$ would always evaluate to $0$. This would result in the elimination of the $1$ branch, with $x_5$ always being $12$ and $y_1 = 24$. These two optimizations would already allow replacing the whole code from @ssa_form_example with `int x = 12; int y = 24;`. In this simple example this seems obvious, but if $x$ were a function parameter, there could still be call sites where this branch can be eliminated because of the function argument supplied there.
 === Common subexpression elimination @x-optimizing_compiler_wikipedia
 Common subexpression elimination is done by finding cases where a calculation or a function call without side effects is done multiple times with the same variables and values. An example for this would be the expression `d = (a * b) - c * (a * b)`, which can be rewritten to `tmp = a * b; d = tmp - c * tmp` as long as `a` and `b` remain the same between the two calculations of `a * b`.
+=== Copy propagation @x-copy_propagation_wikipedia
+Copy propagation replaces the uses of a variable that was directly assigned from another variable with that other variable. Example: `y = x; z = 3 + y;` would be rewritten to `z = 3 + x;` since `y` was directly assigned from `x`. An important thing to watch out for is that `x` must not change between the two lines.
+=== Dead code elimination @x-optimizing_compiler_wikipedia
+Dead code elimination removes code which would not be executed under any circumstance, as well as code which modifies variables that will not be used again. Code which would never be executed is, for example, code directly after statements which transfer control flow, such as `break`, `continue`, `return` or `goto`.
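+As a minimal illustration (our example, not from the cited source), both kinds of dead code in one function:
+```C
+int f(int x) {
+    int unused = x * 2; // dead store: `unused` is never read afterwards
+    return x + 1;
+    x = 0;              // unreachable: located directly after the `return`
+}
+```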
+=== Dead branch elimination
+Dead branch elimination is a simpler version of conditional branch elimination where the condition of the branch is a constant false, i.e. `if (0) {...}`. This can happen, for example, when parts of the code were disabled this way for debugging purposes.
 
 = Methodology
 This work was created following the process described in @process_fig. The protocol for the review is divided into the following chapters: for the objective of the research see @research_questions_s, for the search strategy see @sas_s, for the selection criteria see @selection_criteria_s and for the data extraction strategy see @data_extraction_s.
@@ -280,12 +279,6 @@
 _EC1_ is to exclude publications which talk about DFA in other contexts which are not relevant for this publication.
 _EC2--EC5_ are to exclude publications which do not provide enough information to include them in this publication.
 == Data extraction
-Based on the research questions, we collected 9 data items to extract from all included publications. @data_extraction_table lists all data items. \
-Data items _D1--D3_ are to document the source of the publication. \
-_D4_ and _D5_ are to explicitly list the advantages and disadvantages for answering _RQ1_. \
-_D6_ and _D7_ show in which compiler DFA was implemented and if it is running directly on a programming language like C++ or if it runs on a intermediate language like LLVM IR. \
-_D8_ lists which optimizations where performed based on the results of DFA and _D9_ lists the limitations of the executed DFA. (e.g., only run on function scope). \
-All data items were extracted from the full text of all included publications.
 #[ // data_extraction_table
 #set par(leading: 0.4em)
 #set text(size: 9pt)
@@ -311,6 +304,12 @@ All data items were extracted from the full text of all included publications.
 )
 )
 ]
+Based on the research questions, we collected 9 data items to extract from all included publications. @data_extraction_table lists all data items. \
+Data items _D1--D3_ document the source of the publication. \
+_D4_ and _D5_ explicitly list the advantages and disadvantages for answering _RQ1_. \
+_D6_ and _D7_ show in which compiler DFA was implemented and whether it runs directly on a programming language like C++ or on an intermediate language like LLVM IR. \
+_D8_ lists which optimizations were performed based on the results of DFA and _D9_ lists the limitations of the executed DFA (e.g., only run on function scope). \
+All data items were extracted from the full text of all included publications.
 = Findings
 In this chapter we list our findings from the conducted systematic literature review.
@@ -350,42 +349,6 @@ The work by Zaidi and Greaves @y-zaidi_value_2015 shows that dataflow analysis i
 }
 )
 As seen in @demographic_pub_year, most of the analyzed publications are from the last 15 years, which indicates that this field is still being actively researched and explored, although research had already started back in 1973. Since research started over 50 years ago, this also indicates that the field is by now well-established. There are certainly more publications which are not listed here and not represented in this figure, but that is because the scope of this paper was deliberately limited.
\ -=== Research focus -#figure( // demographic_research_focus - caption: "Research focus of the publications", - { - let data = ( - ("Algorithms and Techniques", 5), // 1, 2, 5, 7, 12 - ("Implementation and Reusability", 2), // 3, 8 - ("Analysis speed improvement", 4), // 4, 6, 14, 15 - ("Custom IR for analysis", 3), // 9, 10, 13 - ("Tools for implementation of DFA", 1), // 11 - ) - - cetz.canvas({ - //let colors = (red, eastern, green, blue, navy, purple, maroon, orange) - let colors = gradient.linear(..color.map.rainbow.map(v => v.darken(20%).saturate(20%))) - - // cspell:disable-next-line - cetz_chart.piechart( - data, - value-key: 1, - label-key: 0, - radius: 3, - slice-style: colors, - inner-radius: 0, - inner-label: (content: (value, _) => [#text(white, str(value))], radius: 150%), - outer-label: (content: (value, _) => [], radius: 0%), - legend: ( - position: "south", - anchor: "north", - orientation: ttb - ) - ) - }) - } -) -The focus of the different papers can be seen in @demographic_research_focus. Most of the papers #cgy[@y-kildall_unified_1973 @y-rastislav_bodik_interprocedural_1997 @y-joisha_technique_2011 @y-tang_summary-based_2012 @y-pathade_path_2019] focus on creating and implementing new algorithms and techniques. Another big focus of the included papers is speeding up the analysis, which also makes it more viable for using in JIT compilers. While #cgy[@y-edvinsson_multi-threaded_2010 @y-edvinsson_parallel_2011] try to do this by simply running parts of the analysis on different threads, the work by Shi and Zhang @y-shi_pipelining_2020 tries to pipeline the analysis of functions and the work by Aigner, Barany and Mössenböck @y-aigner_lazy_2024 tries to skip parts by only lazily iterating over nodes of the IR. In the works of Duboscq et al., Zaidi et al., Reissmann et al. #cgy[@y-duboscq_graal_2013 @y-zaidi_value_2015 @y-reissmann_rvsdg_2020] a custom IR is implemented to make it easier to run parts of the DFA or to have a better structure then the previous code or IR. The focus of the work by Ramsey et al. @y-ramsey_hoopl_2010 is to provide a generic library for implementing DFA and using it and the work of Urban and Steinlechner @y-urban_implementing_2013 is to provide an example implementation of the library to show how it works. The work by Ginsbach, Crawford and O'Boyle @y-ginsbach_candl_2018 defines and implements a compiler for a Domain-Specific Language (DSL) defined in the paper to implement DFA algorithms in the LLVM framework to make it easier for researchers to try out new ideas and implement them. === Target languages #figure( // demographic_target_lang caption: "Target languages of the publications", @@ -425,7 +388,42 @@ The focus of the different papers can be seen in @demographic_research_focus. Mo } ) @demographic_target_lang shows a 33% trend towards implementing DFA optimizations either with LLVM directly or by operating on the LLVM IR, while Java is either directly used as bytecode or as SSA representation of Java. This shows that LLVM is a good platform for implementing optimizations and that it has a lower barrier of entry for developing optimizations. 
+=== Research focus
+#figure( // demographic_research_focus
+  caption: "Research focus of the publications",
+  {
+    let data = (
+      ("Algorithms and Techniques", 5), // 1, 2, 5, 7, 12
+      ("Implementation and Reusability", 2), // 3, 8
+      ("Analysis speed improvement", 4), // 4, 6, 14, 15
+      ("Custom IR for analysis", 3), // 9, 10, 13
+      ("Tools for implementation of DFA", 1), // 11
+    )
+
+    cetz.canvas({
+      //let colors = (red, eastern, green, blue, navy, purple, maroon, orange)
+      let colors = gradient.linear(..color.map.rainbow.map(v => v.darken(20%).saturate(20%)))
+      // cspell:disable-next-line
+      cetz_chart.piechart(
+        data,
+        value-key: 1,
+        label-key: 0,
+        radius: 3,
+        slice-style: colors,
+        inner-radius: 0,
+        inner-label: (content: (value, _) => [#text(white, str(value))], radius: 150%),
+        outer-label: (content: (value, _) => [], radius: 0%),
+        legend: (
+          position: "south",
+          anchor: "north",
+          orientation: ttb
+        )
+      )
+    })
+  }
+)
+The focus of the different papers can be seen in @demographic_research_focus. Most of the papers #cgy[@y-kildall_unified_1973 @y-rastislav_bodik_interprocedural_1997 @y-joisha_technique_2011 @y-tang_summary-based_2012 @y-pathade_path_2019] focus on creating and implementing new algorithms and techniques. Another big focus of the included papers is speeding up the analysis, which also makes it more viable for use in JIT compilers. While #cgy[@y-edvinsson_multi-threaded_2010 @y-edvinsson_parallel_2011] try to do this by simply running parts of the analysis on different threads, the work by Shi and Zhang @y-shi_pipelining_2020 tries to pipeline the analysis of functions, and the work by Aigner, Barany and Mössenböck @y-aigner_lazy_2024 tries to skip parts by only lazily iterating over the nodes of the IR. In the works of Duboscq et al., Zaidi et al. and Reissmann et al. #cgy[@y-duboscq_graal_2013 @y-zaidi_value_2015 @y-reissmann_rvsdg_2020] a custom IR is implemented to make it easier to run parts of the DFA or to have a better structure than the previous code or IR. The focus of the work by Ramsey et al. @y-ramsey_hoopl_2010 is to provide a generic library for implementing DFA, while the work of Urban and Steinlechner @y-urban_implementing_2013 provides an example implementation based on that library to show how it works. The work by Ginsbach, Crawford and O'Boyle @y-ginsbach_candl_2018 defines a Domain-Specific Language (DSL) for implementing DFA algorithms in the LLVM framework and implements a compiler for it, to make it easier for researchers to try out new ideas and implement them.
 == RQ1: Advantages and disadvantages of using Dataflow analysis for compiler optimization
 DFA makes many big compiler optimizations possible, but it also brings many trade-offs, and not just for performance.
 These optimizations eliminate unused code and simplify expressions, which reduces execution time and memory footprint during runtime.
 Later publications by Rastislav et al. and Joisha et al. #cgy[@y-rastislav_bodik_interprocedural_1997 @y-joisha_technique_2011]
@@ -436,7 +434,7 @@
 While performance is not the biggest concern for DFA, since it runs at compile time and accuracy is more important, as described in the work by Edvinsson and Löwe @y-edvinsson_multi-threaded_2010, many publications #cgy[@y-edvinsson_multi-threaded_2010 @y-edvinsson_parallel_2011 @y-shi_pipelining_2020 @y-aigner_lazy_2024] have investigated how to improve the performance of DFA.
 This is done with several techniques: in both publications by Edvinsson, Löwe and Lundberg #cgy[@y-edvinsson_multi-threaded_2010 @y-edvinsson_parallel_2011], different function calls are run on different threads, but this has the problem of creating and queuing a task for each function, which can lead to a big overhead. In later work by Edvinsson, Löwe and Lundberg @y-edvinsson_parallel_2011, independent branches are also run on separate threads. A big problem with both approaches is avoiding that a function is queued for analysis by more than one thread, which would lead to unnecessary redundant work. \
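+As a generic sketch of this kind of function-level parallelization (our illustration, not the exact algorithm of the cited works), each function can be claimed atomically so that no thread analyzes it twice:
+```C
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#define N_FUNCS 4
+#define N_THREADS 2
+
+static const char *funcs[N_FUNCS] = { "main", "f", "g", "h" };
+static bool claimed[N_FUNCS];           // set once some thread has taken the function
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
+static void analyze(const char *name) {
+    printf("analyzing %s\n", name);     // stand-in for the per-function dataflow analysis
+}
+
+static void *worker(void *arg) {
+    (void)arg;
+    for (int i = 0; i < N_FUNCS; i++) {
+        pthread_mutex_lock(&lock);
+        bool mine = !claimed[i];        // claim atomically, so a function is never
+        claimed[i] = true;              // queued or analyzed by two threads
+        pthread_mutex_unlock(&lock);
+        if (mine)
+            analyze(funcs[i]);
+    }
+    return NULL;
+}
+
+int main(void) {
+    pthread_t threads[N_THREADS];
+    for (int i = 0; i < N_THREADS; i++)
+        pthread_create(&threads[i], NULL, worker, NULL);
+    for (int i = 0; i < N_THREADS; i++)
+        pthread_join(threads[i], NULL);
+    return 0;
+}
+```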
 // TODO explain in detail how functions are selected
 Another approach, described in the work by Shi and Zhang @y-shi_pipelining_2020, is to pipeline the analysis of function calls. This is done by first analyzing all variables which do not depend on any function calls. When the function calls have finished being analyzed, the variables which depend on them are analyzed as well. Thereby more parallel work is possible.
 === Implementation complexity
-Another problem with DFA is the difficulty to implement optimizations with it, which is explained in the works by Ramsey et al. and Ginsbach et al. #cgy[@y-ramsey_hoopl_2010 @y-ginsbach_candl_2018]. DFA is often also deeply entangled with the compiler internals, which makes it difficult to reuse existing optimizations with other compilers or implement new optimizations quickly and it is complicated to implemented, as seen in LLVM: "simple peephole optimizations in the LLVM instcombine pass contain approximately 30000 lines of complex C++ code, despite the transformations being simple" @y-ginsbach_candl_2018 \ // TODO fix cite
+Another problem with DFA is the difficulty of implementing optimizations with it, which is explained in the works by Ramsey et al. and Ginsbach et al. #cgy[@y-ramsey_hoopl_2010 @y-ginsbach_candl_2018]. DFA is often also deeply entangled with the compiler internals, which makes it difficult to reuse existing optimizations in other compilers or to implement new optimizations quickly, and implementations get complicated, as seen in LLVM: "simple peephole optimizations in the LLVM instcombine pass contain approximately 30000 lines of complex C++ code, despite the transformations being simple" (@y-ginsbach_candl_2018, p. 151) \
 One solution to this problem is described in the work by Ramsey, Dias and Peyton-Jones @y-ramsey_hoopl_2010: a library in Haskell which performs the dataflow analysis and provides an interface, which "is made possible by sophisticated aspects of Haskell's type system, such as higher-rank polymorphism, GADTs, and type functions" @y-ramsey_hoopl_2010, for implementing various optimizations, which can then also be reused in other compilers. The biggest drawback of this library is that it is limited to compilers implemented in Haskell. \
 Another approach is described in the work of Ginsbach, Crawford and O'Boyle @y-ginsbach_candl_2018: a domain-specific language for implementing LLVM optimization passes. A simple language is used to directly express the logic of the optimization, and a custom transpiler then converts it into an LLVM pass written in C++. Since the generated LLVM pass is implemented in a more generic way to fit this purpose, it leads to a moderate compile-time increase. There is no formal verification done on the implemented optimization pass.
 Because of these disadvantages, the DSL is a great tool to quickly implement, test and iterate on optimizations, but for more permanent passes, hand-written C++ code should be used.
 
 === Limitations
@@ -446,9 +444,62 @@ Another thing that complicates DFA in languages like C is the usage of pointers
 Since inlining is required to perform rewrites, it can lead to bloating the executable and make it overly large.
 == RQ2: Usage of dataflow analysis in current compilers
-The Glasgow Haskell Compiler (GHC), LLVM, and GCC are good examples for compilers which already extensively use DFA to implement optimizations.
+The Glasgow Haskell Compiler (GHC), LLVM, and GCC are good examples of compilers which already extensively use DFA to implement optimizations. The optimizations implemented by the analyzed papers are described in the following sections. These optimizations include common sub-expression elimination #cgy[@y-kildall_unified_1973 @y-tang_summary-based_2012 @y-reissmann_rvsdg_2020], copy propagation #cgy[@y-joisha_technique_2011 @y-tang_summary-based_2012], constant propagation @y-kildall_unified_1973, conditional branch elimination @y-rastislav_bodik_interprocedural_1997 and dead code elimination @y-reissmann_rvsdg_2020.
+=== Copy propagation
+Copy propagation is implemented in the work of Joisha, Schreiber, Banerjee, Boehm and Chakrabarti @y-joisha_technique_2011 with a focus on making it applicable in multi-threaded environments.
+It is implemented based on a procedural concurrency graph which is built from the source code. The nodes are all procedures which could run in the program. The edges between the nodes represent an MHP (may-happen-in-parallel) relation (@y-joisha_technique_2011, p. 627), i.e. a possible overlap of the execution of both nodes. The function $I((p_1, p_2))$ lists the variables on which the procedures $p_1$ and $p_2$ interfere. Interference in this context is a read and a write in overlapping (parallel) regions of the procedures. As long as two functions do not interfere on a variable, or the corresponding lock for the variable is held, it is possible to perform copy propagation for that variable.
+#figure( // copy_prop_rq2_example
+  caption: [Example for demonstrating copy propagation in multi-threaded programs, adapted from Fig. 1 and Fig. 2 in the work of Joisha, Schreiber, Banerjee, Boehm and Chakrabarti @y-joisha_technique_2011],
+  kind: "raw",
+  grid(
+    columns: (1fr, 1fr),
+    grid.cell(colspan: 2, align(center)[Global: `int X, Y; X = Y = 0; mutex my;` #v(1mm)]),
+    [
+      #align(center, [Thread $t_1$])
+      ```C
+      int a = X;
+      LOCK(my);
+      int b = Y;
+      print(a, b);
+      Y = b + 1;
+      UNLOCK(my);
+      ```
+    ],
+    [
+      #align(center, [Thread $t_2$])
+      ```C
+      X = 1;
+      int a = Y;
+      X = a + 1;
+      LOCK(my);
+      Y = a + 1;
+      UNLOCK(my);
+      print(X, Y);
+      ```
+    ]
+  )
+)
+This technique can be explained based on @copy_prop_rq2_example. In thread $t_1$ there are two opportunities for applying copy propagation. The first is the variable `a` on line 1, which can be propagated to the `print` on line 4, since no writes to the global variable `X` happen in this thread. The second is the variable `b`, since access to the global variable `Y` is locked behind the mutex `my`. In thread $t_2$ copy propagation cannot be performed, since the variable `a` reads from the global variable `Y` without the mutex `my` being locked. Replacing `a` with `Y` could therefore yield a different value on line 3 than on line 5, because `Y` is also written in $t_1$.
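+Assuming the two propagation opportunities described above, thread $t_1$ could be rewritten as follows (a sketch, with the names from @copy_prop_rq2_example; the now unused copies `a` and `b` become dead stores that dead code elimination can remove afterwards):
+```C
+int a = X;
+LOCK(my);
+int b = Y;
+print(X, Y);  // `a` replaced by X, `b` replaced by Y
+Y = Y + 1;    // `b` replaced by Y
+UNLOCK(my);
+```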
+ // TODO rewrite +/* += y-tang_summary-based_2012 +based on equal-reasoning +summary-based: + uses approximations for most things + based on two steps + 1 + points-to + side effects + make summary for procedure from above + runs as bottom-up cg traversal + 2 + propagates actual arguments + computes final points-to and side effects + runs as top-down cg traversal +*/ +// TODO mention DFA also used for compiler warnings/errors = Conclusion Our findings show that DFA is already extensively used in current compilers and brings big advantages for runtime speed. The cost of this is a higher compilation duration, which makes it unsuitable for JIT compilation. Furthermore, DFA allows complex optimizations across branches and function boundaries which would not be possible with traditional straight-line optimizations. \ @@ -476,6 +527,7 @@ The adaptability of LLVM and the associated immediate representation makes it an show heading: set text(weight: "regular") context { let slr_data = csv("pubs.csv") + let header = slr_data.at(0) let slr_data = slr_data.slice(1) let slr_bib = get-bibliography("y-") let key_map = (:) @@ -489,6 +541,7 @@ The adaptability of LLVM and the associated immediate representation makes it an table( columns: (auto, auto, auto, auto, auto, auto, 6em, 4.05em, auto, auto), inset: (x: 5pt, y: 3pt), + ..header, ..slr_data.flatten() ) } diff --git a/refs.bib b/refs.bib index 1a441a3..0c71c97 100644 --- a/refs.bib +++ b/refs.bib @@ -15,7 +15,6 @@ urldate = {2025-05-22}, date = {2019-06}, langid = {english}, - file = {PDF:/home/mutzi/Zotero/storage/56XR9EVM/Ciccozzi et al. - 2019 - Execution of UML models a systematic review of research and practice.pdf:application/pdf}, } @article{gotz_claimed_2021, @@ -34,7 +33,6 @@ urldate = {2025-05-22}, date = {2021-04}, langid = {english}, - file = {PDF:/home/mutzi/Zotero/storage/29BSZNTU/Götz et al. - 2021 - Claimed advantages and disadvantages of (dedicated) model transformation languages a systematic lit.pdf:application/pdf}, } @article{cytron_efficiently_1991, @@ -51,7 +49,6 @@ urldate = {2025-05-31}, date = {1991-10}, langid = {english}, - file = {PDF:/home/mutzi/Zotero/storage/BN26AU5Q/Cytron et al. - 1991 - Efficiently computing static single assignment form and the control dependence graph.pdf:application/pdf}, } @book{cooper_keith_d_engineering_2011, @@ -67,5 +64,17 @@ title = {Optimizing compiler - Wikipedia}, url = {https://en.wikipedia.org/wiki/Optimizing_compiler}, urldate = {2025-06-27}, - file = {Optimizing compiler - Wikipedia:/home/mutzi/Zotero/storage/GK9J35B4/Optimizing_compiler.html:text/html}, +} + +@online{copy_propagation_wikipedia, + title = {Copy propagation - Wikipedia}, + url = {https://en.wikipedia.org/wiki/Copy_propagation}, + urldate = {2025-07-08}, +} + +@inproceedings{rayside_pointsto_2005, + title = {{PointsTo} Analysis}, + url = {https://api.semanticscholar.org/CorpusID:14451904}, + author = {Rayside, Derek}, + date = {2005}, }