diff --git a/.vscode/settings.json b/.vscode/settings.json index 53ef732..02fcc82 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -2,6 +2,8 @@ "editor.wordWrap": "on", "cSpell.words": [ "interprocedurally", + "Regionalized", + "RVSDG", "subexpression", "Verilog", "VHDL", diff --git a/main.typ b/main.typ index 095d35d..f8d7daf 100644 --- a/main.typ +++ b/main.typ @@ -1,7 +1,7 @@ // cSpell:ignoreRegExp @[a-zA-Z0-9_-]+ // cSpell:ignoreRegExp #\w+\( // cSpell:ignore cetz booktbl bibliographyx citegroup instcombine -// cSpell:ignore Reissmann Soffa Zaidi Aigner Barany Mössenböck Duboscq Steinlechner Ginsbach Kildall Rastilav Joisha Rajiv Schreiber Banerjee Boehm Chakrabarti Löwe Edvinsson Peyton-Jones +// cSpell:ignore Reissmann Soffa Zaidi Aigner Barany Mössenböck Duboscq Steinlechner Ginsbach Kildall Rastilav Joisha Rajiv Schreiber Banerjee Boehm Chakrabarti Löwe Edvinsson Peyton-Jones Bahmann Själander // cSpell:disable #import "clean-acmart.typ": acmart #import "@preview/cetz:0.3.4" @@ -47,11 +47,10 @@ #set heading(supplement: "Sect.") #set figure(supplement: [Fig.]) -#show figure.caption: it => [ - #set text(size: 8pt) - *#it.supplement #context it.counter.display(it.numbering)* - #it.body -] +#show figure.caption: it => { + set text(size: 8pt) + v(1.5mm); strong(it.supplement + " " + context { it.counter.display(it.numbering)}); h(0.3em); it.body +} #show figure.where(kind: "raw"): set figure(supplement: [Listing]) #show figure.where(kind: "raw"): it => align(left, { v(8pt, weak: true); it.body; v(4pt, weak: true); it.caption; v(8pt, weak: true) }) #show figure: it => { v(1.25em, weak: true); it; v(1.25em, weak: true) } @@ -81,7 +80,7 @@ #set heading(numbering: "1.1.1") // cSpell:enable -// TODO Small info what is LLVM, LowLevelVM + footnote link +// TODO Small info what is LLVM + footnote link = Abstract Dataflow analysis is an important part of compiler optimization since it allows the compiler to eliminate or rewrite parts of the code with various techniques such as constant propagation, dead code elimination and branch elimination. This work aims to look at the advantages and disadvantages of using dataflow analysis, how it is already used in current compilers, on which programming languages or intermediate representations it operates, and what limitations still exist. \ For this purpose we conducted a systematic literature review in which we analyze 15 publications selected from 571 entries. Finally, the following conclusions were drawn: dataflow analysis is used in many of today's popular compilers and the field is actively being researched. Dataflow analysis enables large performance gains, but its implementations are complex, and care must be taken that they do not change the behavior of the program in unintended ways. @@ -95,6 +94,8 @@ While this paper talks about dataflow analysis in the context of compiler optimi This work is divided into the following sections: in @background_c the background required to understand this work is given, in @methodology_c the methodology used to create this work is described, in @findings_c the contents of the papers are analyzed and evaluated, in @conclusion_c the results from this work are summarized. = Background +== LLVM #footnote[https://llvm.org/] +LLVM is a compiler framework mainly consisting of the LLVM Core, which serves as the backend for optimizing LLVM IR and compiling it to machine-specific code, and Clang, a frontend for compiling C, C++ and Objective-C to LLVM IR.
There are more components, like a debugger and a C and C++ standard library, and there are also many external projects for compiling other languages and targeting other machines. The biggest advantage of LLVM is its flexibility and easy extensibility, which makes it a useful framework for most kinds of compiler research. == Control flow graph #figure( // ssa_form_example caption: [C code and respective SSA in control flow graph form, adapted from Fig. 1 in the work of Reissmann, Meyer, Bahmann and Själander @y-reissmann_rvsdg_2020], @@ -350,7 +351,7 @@ The work by Zaidi and Greaves @y-zaidi_value_2015 shows that dataflow analysis i // cSpell:enable } ) -As seen in @demographic_pub_year most of the analyzed publication are from the last 15 years, which indicates that this field is still actively being researched and explore, but research has already start back in 1983. Since research started over 50 years ago it indicates that this field is by now well-established. There are definitely more publications which are not listed here and not represented in this figure, but that is because the scope of this papers was very limited. \ +As seen in @demographic_pub_year most of the analyzed publications are from the last 15 years, which indicates that this field is still actively being researched and explored, but research already started back in 1973. Since research started over 50 years ago, this also indicates that the field is by now well-established. There are certainly more publications which are not listed here and not represented in this figure, but that is because the scope of this paper is very limited. \ === Target languages #figure( // demographic_target_lang caption: "Target languages of the publications", @@ -389,7 +390,8 @@ As seen in @demographic_pub_year most of the analyzed publication are from the l }) } ) -@demographic_target_lang shows a 33% trend towards implementing DFA optimizations either with LLVM directly or by operating on the LLVM IR, while Java is either directly used as bytecode or as SSA representation of Java. This shows that LLVM is a good platform for implementing optimizations and that it has a lower barrier of entry for developing optimizations. +@demographic_target_lang shows that 33% of the publications implement DFA optimizations either with LLVM directly or by operating on the LLVM IR, while Java is used either directly as bytecode or as an SSA representation of Java. This suggests that LLVM is a good platform for implementing optimizations and that it has a low barrier of entry for developing them. \ +The publication which does not target any language is the one by Kildall @y-kildall_unified_1973, which only discusses theoretical ways to implement DFA. The publication by Ramsey, Dias and Peyton-Jones @y-ramsey_hoopl_2010 implements, in Haskell, a custom IR on which the optimizations run, with a custom data type for the different possible instructions. // TODO mention which pubs are in each category === Research focus #figure( // demographic_research_focus @@ -427,14 +429,17 @@ As seen in @demographic_pub_year most of the analyzed publication are from the l } ) The focus of the different papers can be seen in @demographic_research_focus. Most of the papers #cgy[@y-kildall_unified_1973 @y-rastislav_bodik_interprocedural_1997 @y-joisha_technique_2011 @y-tang_summary-based_2012 @y-pathade_path_2019] focus on creating and implementing new algorithms and techniques.
Another big focus of the included papers is speeding up the analysis, which also makes it more viable for use in JIT compilers. While #cgy[@y-edvinsson_multi-threaded_2010 @y-edvinsson_parallel_2011] try to do this by simply running parts of the analysis on different threads, the work by Shi and Zhang @y-shi_pipelining_2020 tries to pipeline the analysis of functions, and the work by Aigner, Barany and Mössenböck @y-aigner_lazy_2024 tries to skip parts by iterating over the nodes of the IR only lazily. In the works of Duboscq et al., Zaidi et al., Reissmann et al. #cgy[@y-duboscq_graal_2013 @y-zaidi_value_2015 @y-reissmann_rvsdg_2020] a custom IR is implemented to make it easier to run parts of the DFA or to have a better structure than the previous code or IR. The focus of the work by Ramsey et al. @y-ramsey_hoopl_2010 is to provide a generic library for implementing DFA, while the work of Urban and Steinlechner @y-urban_implementing_2013 provides an example implementation based on the library to show how it works. The work by Ginsbach, Crawford and O'Boyle @y-ginsbach_candl_2018 defines a Domain-Specific Language (DSL) for expressing DFA algorithms in the LLVM framework and implements a compiler for it, making it easier for researchers to try out and implement new ideas. + == RQ1: Advantages and disadvantages of using Dataflow analysis for compiler optimization DFA makes many big compiler optimizations possible, but it also brings trade-offs, and not just in performance. These optimizations eliminate unused code and simplify expressions, which reduces execution time and memory footprint during runtime. The work by Kildall @y-kildall_unified_1973 is one of the first to talk about DFA and how it allows previously existing optimizations, which could only be applied to code sections without branches, to be used in the presence of branching by checking how data flows through the branches. Later publications by Rastilav et al. and Joisha et al. #cgy[@y-rastislav_bodik_interprocedural_1997 @y-joisha_technique_2011] describe ways to apply these optimizations interprocedurally and across thread synchronization boundaries. The work by Rastilav, Rajiv and Soffa @y-rastislav_bodik_interprocedural_1997 does this by inlining the called procedure and then performing dataflow analysis. This optimizes every procedure call for its specific call location, but brings the disadvantage of rapidly increasing the size of the optimized program. An important requirement that the work by Joisha, Schreiber, Banerjee, Boehm and Chakrabarti @y-joisha_technique_2011 describes is that programs must be well synchronized; otherwise DFA cannot be used because of possible data races. - === Analysis performance -While performance is not the biggest concern for DFA, since it runs at compile-time and accuracy is more important as described in the work by Edvinsson and Löwe @y-edvinsson_multi-threaded_2010. Many publications #cgy[@y-edvinsson_multi-threaded_2010 @y-edvinsson_parallel_2011 @y-shi_pipelining_2020 @y-aigner_lazy_2024] have investigated how to improve the performance of DFA. This is done with several techniques: In both publications by Edvinsson, Löwe and Lundberg #cgy[@y-edvinsson_multi-threaded_2010 @y-edvinsson_parallel_2011] different function calls are run on different threads, but it has the problem of creating and queue a task for each function, which can lead to a big overhead.
In later work by Edvinsson, Löwe and Lundberg @y-edvinsson_parallel_2011 independent branches are also run on separate threads. A big problem with both approaches is to avoid, that some functions could be queued for analysis be more than one thread, which leads to unnecessary redundancy. \ // TODO explain detailed how function are selected +While performance is not the biggest concern for DFA, since it runs at compile time and accuracy is more important, as described in the work by Edvinsson and Löwe @y-edvinsson_multi-threaded_2010, many publications #cgy[@y-edvinsson_multi-threaded_2010 @y-edvinsson_parallel_2011 @y-shi_pipelining_2020 @y-aigner_lazy_2024] have investigated how to improve the performance of DFA. This is done with several techniques, described next. \ +In both publications by Edvinsson, Löwe and Lundberg #cgy[@y-edvinsson_multi-threaded_2010 @y-edvinsson_parallel_2011] different function calls are run on different threads, but this has the problem of creating and queuing a task for each function, which can lead to a big overhead. +The later work by Edvinsson, Löwe and Lundberg @y-edvinsson_parallel_2011 expands upon the first work by Edvinsson and Löwe @y-edvinsson_multi-threaded_2010, and both detail how just running every SSA-node on a new thread leads to almost no speed-up, since most SSA-nodes are sequentially dependent on each other, and it also leads to thread-switch and synchronization overhead. To solve this problem they describe a way to cluster SSA-nodes such that the clusters are independent from each other. SSA-nodes are independent from one another if they cannot reach each other in the SSA-graph, if they are in different functions, or if they are analyzed in a different context. They also want to avoid dependencies between clusters, because then it is necessary to perform synchronization between the threads which operate on those clusters. Their algorithm allows nodes to be assigned to multiple clusters, but then the analysis results lead to dependencies between the clusters because the results must be shared. The algorithm also allows independent branches (i.e. the _true_ and the _false_ branch of an `if`) to be processed independently, but after calculating both branches the results must be merged in one thread, which requires synchronization. They also process all potential call targets of a method in parallel. Both of these can lead to redundancy, because already queued nodes can still be reached through other nodes which are called by these nodes. To avoid the overhead of analyzing small methods separately, a threshold is added to indicate when it is worth performing the task on a separate thread. The threshold is checked by comparing the number of methods reachable from the to-be-analyzed method to a fixed number, which was determined experimentally. They also try to reduce redundancy by checking the set of methods which would be called by the new task against the sets of methods called by the currently running and waiting tasks. When the overlap between those sets is smaller than a fixed threshold, also determined experimentally, the new task is queued. The results show a maximum speed-up of 2.43 and an average of 1.71. A problem with this approach is that the fixed threshold values differ between programs, so they must be determined anew every time. In some cases there is no speed-up for the analysis of a program.
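To make the threshold heuristic more concrete, the following C sketch shows how an analysis could decide between spawning a worker thread and analyzing a method inline. The types, names and the threshold value are purely illustrative and not taken from the publications; the clustering of SSA-nodes and the redundancy check against already running and waiting tasks are omitted.
```c
#include <pthread.h>
#include <stddef.h>

/* Illustrative call-graph node; not the data structure used in the papers. */
typedef struct method {
    struct method **callees;   /* methods reachable through direct calls */
    size_t n_callees;
} method;

/* Placeholder for the actual fixed-point dataflow analysis of one method. */
static void *analyze_task(void *arg) {
    method *m = arg;
    (void)m;                   /* ... iterate over the method's SSA-nodes ... */
    return NULL;
}

/* Methods transitively reachable from m (simplified: assumes an acyclic call graph). */
static size_t reachable_methods(const method *m) {
    size_t n = m->n_callees;
    for (size_t i = 0; i < m->n_callees; i++)
        n += reachable_methods(m->callees[i]);
    return n;
}

/* Stand-in for the experimentally determined threshold from the papers. */
#define SPAWN_THRESHOLD 8

/* Spawn a separate analysis thread only if the method is large enough;
   otherwise analyze it on the current thread to avoid thread overhead. */
int analyze_method(method *m, pthread_t *tid) {
    if (reachable_methods(m) >= SPAWN_THRESHOLD)
        return pthread_create(tid, NULL, analyze_task, m) == 0;  /* 1 = spawned */
    analyze_task(m);
    return 0;                                                    /* ran inline */
}
```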
\
// TODO explain in detail how functions are selected
Another approach, described in the work by Shi and Zhang @y-shi_pipelining_2020, is to pipeline the analysis of function calls. This is done by first analyzing all variables which do not depend on any function calls. When a function call has finished being analyzed, the variables which depend on that call are analyzed. Thereby more work can be done in parallel. === Implementation complexity Another problem with DFA is the difficulty of implementing optimizations with it, which is explained in the works by Ramsey et al. and Ginsbach et al. #cgy[@y-ramsey_hoopl_2010 @y-ginsbach_candl_2018]. DFA is often also deeply entangled with the compiler internals, which makes it difficult to reuse existing optimizations in other compilers or to implement new optimizations quickly, and implementations become complicated, as seen in LLVM: "simple peephole optimizations in the LLVM instcombine pass contain approximately 30000 lines of complex C++ code, despite the transformations being simple" (@y-ginsbach_candl_2018, p. 151) \ @@ -442,7 +447,7 @@ One solutions to this problem is described in the work by Ramsey, Dias and Peyto Another approach, described in the work of Ginsbach, Crawford and O'Boyle @y-ginsbach_candl_2018, is to create a domain-specific language for implementing LLVM optimization passes. This is done by having a simple language for directly expressing the logic of the optimization, while a custom transpiler then converts it into an LLVM pass written in C++. Since the LLVM pass is implemented in a more generic way to fit this purpose, it leads to a moderate compile time increase. There is no formal verification done on the implemented optimization pass. Because of these disadvantages it is a great tool to quickly implement, test and iterate on optimizations, but for more permanent passes, hand-written C++ code should be used. === Limitations DFA is hard to parallelize because variables are often dependent on other variables or function arguments. While it is possible to analyze multiple functions at the surface level, they still depend on the context of the functions calling them. As already mentioned in the work by Shi and Zhang @y-shi_pipelining_2020, it is still possible to run parts of the analysis in parallel while waiting for the results of other threads. \ -Global variables also make analysis more complicated since the can be accessed and modified by all functions and either need to be treated as an unknown value every time or all functions which work with this variable are analytically dependant on each other and should be locked at when checking the value of the variable. A similar problem exists for variables shared across threads, because the analysis has to look at all functions which could modify the variable. Alternatively the variable should be well synchronized so that only one thread can write it or multiple threads can read it, but not both options at the same time as described in the work by Joisha, Schreiber, Banerjee, Boehm and Chakrabarti @y-joisha_technique_2011. \ +Global variables also make analysis more complicated since they can be accessed and modified by all functions; they either need to be treated as an unknown value every time, or all functions which work with such a variable become analytically dependent on each other and must be looked at when checking the value of the variable.
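To make this limitation concrete, consider the following minimal C example; the names `threshold`, `update_config` and `classify` are purely illustrative and not taken from any of the analyzed publications. Because any function may write the global, the fact that `limit` holds the same value as `threshold` is invalidated by the call, so propagating the global into later uses would change the program's behavior.
```c
/* Illustrative sketch only; not taken from the analyzed publications. */
int threshold = 10;                 /* global: any function may modify it       */

void update_config(void) { threshold = 20; }

int classify(int x) {
    int limit = threshold;          /* fact after this line: limit == threshold */
    update_config();                /* threshold may change here, so the fact   */
                                    /* "limit == threshold" no longer holds     */
    return x > limit;               /* replacing limit with threshold here      */
                                    /* (copy propagation) would be incorrect    */
}
```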
A similar problem exists for variables shared across threads, because the analysis has to look at all functions which could modify the variable. As described in the work by Joisha, Schreiber, Banerjee, Boehm and Chakrabarti @y-joisha_technique_2011, an alternative is that such variables are well synchronized, so that either only one thread can write the variable or multiple threads can read it, but not both at the same time. \ Another thing that complicates DFA in languages like C is the use of pointers, because they allow the program to modify variables in unpredictable ways, which invalidates all facts and assumptions made about those variables up to that point. \ Since inlining is required to perform interprocedural rewrites, it can lead to bloating the executable and making it overly large. @@ -491,8 +496,32 @@ It is implemented based on a procedural concurrency graph which is build from th ) ) This technique can be explained based on @copy_prop_rq2_example. In thread $t_1$ there are two opportunities for applying copy propagation. The first is that the variable `a` from line 1 can be propagated to the `print` in line 4, since no writes to the global variable `X` happen in this thread. The second is the variable `b`, since access to the global variable `Y` is locked behind the mutex `my`. In thread $t_2$ copy propagation cannot be performed, since the variable `a` reads from the global variable `Y` and this read is not protected by locking the mutex `my`. This could result in `Y` having a different value on line 3 and line 5, because it is also written in $t_1$. - -// TODO rewrite +=== Regionalized Value State Dependence Graph (RVSDG) +#place( // rvsdg_example + bottom + center, + scope: "parent", + float: true, + [ + #figure( + caption: [Example of how RVSDG represents a program, taken from Fig. 1 of the work by Reissmann, Meyer, Bahmann and Själander @y-reissmann_rvsdg_2020], + grid( + columns: (1fr,)*4, + column-gutter: 0.5em, + image("rvsdg_1_code.svg"), + image("rvsdg_2_cfg.svg"), + image("rvsdg_3_uir.svg"), + image("rvsdg_4_oir.svg") + ) + ) + ] +) +In their work, Reissmann, Meyer, Bahmann and Själander @y-reissmann_rvsdg_2020 describe a new IR for optimizing compilers which makes optimizations easier and simpler to implement. Based on this IR they implemented their own compiler, jlm #footnote[https://github.com/phate/jlm], which consumes and outputs LLVM IR and was able to reach almost the same speed-up as LLVM's own optimizer. \ +RVSDG is implemented as a tree structure consisting of nested regions. A region simply represents a collection of nodes and edges, together with input and output connections. The edges in a region either run between nodes or connect nodes to the inputs and outputs of the region. \ +Nodes either represent primitive operations like arithmetic, memory operations and function calls, or they are structural nodes, where each kind represents a different construct: Gamma (#sym.gamma) nodes model branching statements like `if` or `switch`. Theta (#sym.theta) nodes model tail-controlled loops; for head-controlled loops a combination of a gamma node and a theta node is used. Lambda (#sym.lambda) nodes model functions, with multiple input connections representing the arguments and a single output connection for the function result. Delta (#sym.delta) nodes model global variables, with inputs representing dependencies on external variables and an output representing the variable's value.
Phi (#sym.Phi) nodes are used for recursive functions and contain a single #sym.lambda node representing the actual function; their inner input connection is connected to the lambda node, which can then use it to call itself again, thereby allowing RVSDG to represent recursive calls without cycles. Omega (#sym.omega) nodes represent a translation unit; an omega node is the top-level node of an RVSDG and has no input or output connections. +// TODO maybe explain nodes in/out better +// TODO explain edges +// TODO explain optimizations +// TODO explain example = Conclusion Our findings show that DFA is already extensively used in current compilers and brings big advantages in runtime speed. The cost of this is a higher compilation duration, which makes it unsuitable for JIT compilation. Furthermore, DFA allows complex optimizations across branches and function boundaries which would not be possible with traditional straight-line optimizations. \ @@ -512,7 +541,7 @@ The adaptability of LLVM and the associated immediate representation makes it an #counter(heading).update(0) #{ // slr results table - set page(flipped: true, columns: 1, margin: 1.75em) + set page(flipped: true, columns: 1, margin: (x: 1.75em, y: 3em)) [= SLR Results] v(1em) counter(heading).update(0) diff --git a/rvsdg_1_code.svg b/rvsdg_1_code.svg new file mode 100644 index 0000000..0a9bd46 --- /dev/null +++ b/rvsdg_1_code.svg @@ -0,0 +1,124 @@ diff --git a/rvsdg_2_cfg.svg b/rvsdg_2_cfg.svg new file mode 100644 index 0000000..1db754f --- /dev/null +++ b/rvsdg_2_cfg.svg @@ -0,0 +1,749 @@ diff --git a/rvsdg_3_uir.svg b/rvsdg_3_uir.svg new file mode 100644 index 0000000..a342fd5 --- /dev/null +++ b/rvsdg_3_uir.svg @@ -0,0 +1,1379 @@ diff --git a/rvsdg_4_oir.svg b/rvsdg_4_oir.svg new file mode 100644 index 0000000..8a0f67f --- /dev/null +++ b/rvsdg_4_oir.svg @@ -0,0 +1,1171 @@