@article{Zhang1474896, author = {Zhang, Long and Morin, Brice and Haller, Philipp and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {SINTEF Digital, Oslo, Norway}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20230630}, number = {11}, pages = {2534--2548}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {A Chaos Engineering System for Live Analysis and Falsification of Exception-handling in the JVM}, volume = {47}, DOI = {10.1109/TSE.2019.2954871}, keywords = {dynamic analysis, exception-handling, production systems, chaos engineering}, abstract = {Software systems contain resilience code to handle those failures and unexpected events happening in production. It is essential for developers to understand and assess the resilience of their systems. Chaos engineering is a technology that aims at assessing resilience and uncovering weaknesses by actively injecting perturbations in production. In this paper, we propose a novel design and implementation of a chaos engineering system in Java called ChaosMachine. It provides a unique and actionable analysis on exception-handling capabilities in production, at the level of try-catch blocks. To evaluate our approach, we have deployed ChaosMachine on top of 3 large-scale and well-known Java applications totaling 630k lines of code. Our results show that ChaosMachine reveals both strengths and weaknesses of the resilience code of a software system at the level of exception handling. }, year = {2021} } @unpublished{Ye1268031, author = {Ye, He and Martinez, Matias and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20181206}, title = {A Comprehensive Study of Automatic Program Repair on the QuixBugs Benchmark}, abstract = {Automatic program repair papers tend to repeatedly use the same benchmarks. 
This poses a threat to the external validity of the findings of the program repair research community. In this paper, we perform an automatic repair experiment on a benchmark called QuixBugs that has been recently published. This benchmark has never been studied in the context of program repair. In this study, we report on the characteristics of QuixBugs, and we design and perform an experiment about the effectiveness of test-suite based program repair on QuixBugs. We study two repair systems, Astor and Nopol, which are representatives of generate-and-validate repair technique and synthesis repair technique respectively. We propose three patch correctness assessment techniques to comprehensively study overfitting and incorrect patches. Our key results are: 1) 13/40 buggy programs in the QuixBugs can be repaired with a test-suite adequate patch; 2) a total of 22 different plausible patches for those 13 buggy programs in the QuixBugs are present in the search space of the considered tools; 3) the three patch assessment techniques discard in total 12/22 patches that are overfitting. This sets a baseline for future research of automatic repair on QuixBugs. Our experiment also highlights the major properties and challenges of how to perform automated correctness assessment of program repair patches. All experimental results are publicly available on Github in order to facilitate future research on automatic program repair. 
} } @inproceedings{Ye1782822, author = {Ye, He and Martinez, Matias and Durieux, Thomas and Monperrus, Martin}, booktitle = {IBF 2019 : 2019 IEEE 1st International Workshop on Intelligent Bug Fixing}, institution = {KTH, Theoretical Computer Science, TCS; KTH, Software and Computer systems, SCS}, note = {Part of ISBN 9781728118093. QC 20230921}, pages = {1--10}, eid = {8665475}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {A Comprehensive Study of Automatic Program Repair on the QuixBugs Benchmark}, DOI = {10.1109/IBF.2019.8665475}, abstract = {Automatic program repair papers tend to repeatedly use the same benchmarks. This poses a threat to the external validity of the findings of the program repair research community. In this paper, we perform an automatic repair experiment on a benchmark called QuixBugs that has never been studied in the context of program repair. In this study, we report on the characteristics of QuixBugs, and study five repair systems, Arja, Astor, Nopol, NPEfix and RSRepair, which are representatives of generate-and-validate repair techniques and synthesis repair techniques. We propose three patch correctness assessment techniques to comprehensively study overfitting and incorrect patches. Our key results are: 1) 15 / 40 buggy programs in the QuixBugs can be repaired with a test-suite adequate patch; 2) a total of 64 plausible patches for those 15 buggy programs in the QuixBugs are present in the search space of the considered tools; 3) the three patch assessment techniques discard in total 33 / 64 patches that are overfitting. This sets a baseline for future research of automatic repair on QuixBugs. Our experiment also highlights the major properties and challenges of how to perform automated correctness assessment of program repair patches. All experimental results are publicly available on Github in order to facilitate future research on automatic program repair. 
}, year = {2019} } @article{Ye1506248, author = {Ye, He and Martinez, Matias and Durieux, Thomas and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {Université Polytechnique Hauts-de-France, France}, journal = {Journal of Systems and Software}, note = {QC 20201202}, eid = {110825}, publisher = {Elsevier Inc.}, title = {A comprehensive study of automatic program repair on the QuixBugs benchmark}, volume = {171}, DOI = {10.1016/j.jss.2020.110825}, keywords = {Automatic program repair, Bug benchmark, Patch correctness assessment, Software engineering, Assessment technique, Automatic programs, Empirical studies, External validities, Overfitting, Repair tools, Research communities, Automatic test pattern generation}, abstract = {Automatic program repair papers tend to repeatedly use the same benchmarks. This poses a threat to the external validity of the findings of the program repair research community. In this paper, we perform an empirical study of automatic repair on a benchmark of bugs called QuixBugs, which has been little studied. In this paper, (1) We report on the characteristics of QuixBugs; (2) We study the effectiveness of 10 program repair tools on it; (3) We apply three patch correctness assessment techniques to comprehensively study the presence of overfitting patches in QuixBugs. Our key results are: (1) 16/40 buggy programs in QuixBugs can be repaired with at least a test suite adequate patch; (2) A total of 338 plausible patches are generated on the QuixBugs by the considered tools, and 53.3% of them are overfitting patches according to our manual assessment; (3) The three automated patch correctness assessment techniques, RGTEvosuite, RGTInputSampling and GTInvariants, achieve an accuracy of 98.2%, 80.8% and 58.3% in overfitting detection, respectively. 
To our knowledge, this is the largest empirical study of automatic repair on QuixBugs, combining both quantitative and qualitative insights. All our empirical results are publicly available on GitHub in order to facilitate future research on automatic program repair.  }, year = {2021} } @article{SotoValero1547362, author = {Soto Valero, Cesar and Harrand, Nicolas and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {Empirical Software Engineering}, note = {QC 20210519}, number = {3}, eid = {45}, publisher = {Springer Nature}, title = {A comprehensive study of bloated dependencies in the Maven ecosystem}, volume = {26}, DOI = {10.1007/s10664-020-09914-8}, keywords = {Dependency management, Software reuse, Debloating, Program analysis}, abstract = {Build automation tools and package managers have a profound influence on software development. They facilitate the reuse of third-party libraries, support a clear separation between the application's code and its external dependencies, and automate several software development tasks. However, the wide adoption of these tools introduces new challenges related to dependency management. In this paper, we propose an original study of one such challenge: the emergence of bloated dependencies. Bloated dependencies are libraries that are packaged with the application's compiled code but that are actually not necessary to build and run the application. They artificially grow the size of the built binary and increase maintenance effort. We propose DepClean, a tool to determine the presence of bloated dependencies in Maven artifacts. We analyze 9,639 Java artifacts hosted on Maven Central, which include a total of 723,444 dependency relationships. 
Our key result is as follows: 2.7% of the dependencies directly declared are bloated, 15.4% of the inherited dependencies are bloated, and 57% of the transitive dependencies of the studied artifacts are bloated. In other words, it is feasible to reduce the number of dependencies of Maven artifacts to 1/4 of its current count. Our qualitative assessment with 30 notable open-source projects indicates that developers pay attention to their dependencies when they are notified of the problem. They are willing to remove bloated dependencies: 21/26 answered pull requests were accepted and merged by developers, removing 140 dependencies in total: 75 direct and 65 transitive. }, year = {2021} } @article{Ginelli1660543, author = {Ginelli, Davide and Martinez, Matias and Mariani, Leonardo and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Milano Bicocca, Milan, Italy.}, institution = {Univ Polytech Hauts de France, Valenciennes, France.}, institution = {Univ Milano Bicocca, Milan, Italy.}, journal = {Empirical Software Engineering}, note = {QC 20220817}, number = {4}, eid = {97}, publisher = {Springer Nature}, title = {A comprehensive study of code-removal patches in automated program repair}, volume = {27}, DOI = {10.1007/s10664-021-10100-7}, keywords = {Automatic program repair, Code-removal patches, Software testing, Debugging}, abstract = {Automatic Program Repair (APR) techniques can promisingly help reduce the cost of debugging. Many relevant APR techniques follow the generate-and-validate approach, that is, the faulty program is iteratively modified with different change operators and then validated with a test suite until a plausible patch is generated. In particular, Kali is a generate-and-validate technique developed to investigate the possibility of generating plausible patches by only removing code. Former studies show that indeed Kali successfully addressed several faults. 
This paper addresses the single and particular case of code-removal patches in automated program repair. We investigate the reasons and the scenarios that make their creation possible, and the relationship with patches implemented by developers. Our study reveals that code-removal patches are often insufficient to fix bugs, and proposes a comprehensive taxonomy of code-removal patches that provides evidence of the problems that may affect test suites, opening new opportunities for researchers in the field of automatic program repair. }, year = {2022} } @article{Vera-Perez1348179, author = {Vera-Perez, Oscar Luis and Danglot, Benjamin and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {Inria Rennes Bretagne Atlantique, Campus Beaulieu 263 Ave Gen Leclerc, F-35042 Rennes, France.}, institution = {Inria Lille Nord Europe, Parc Sci Haute Borne 40,Ave Halley Bat A Pk Plaza, F-59650 Villeneuve Dascq, France.}, journal = {Empirical Software Engineering}, note = {QC 20190903}, number = {3}, pages = {1195--1225}, title = {A comprehensive study of pseudo-tested methods}, volume = {24}, DOI = {10.1007/s10664-018-9653-2}, keywords = {Software testing, Software developers, Pseudo-tested methods, Test quality, Program analysis}, abstract = {Pseudo-tested methods are defined as follows: they are covered by the test suite, yet no test case fails when the method body is removed, i.e., when all the effects of this method are suppressed. This intriguing concept was coined in 2016, by Niedermayr and colleagues, who showed that such methods are systematically present, even in well-tested projects with high statement coverage. This work presents a novel analysis of pseudo-tested methods. First, we run a replication of Niedermayr's study with 28K+ methods, enhancing its external validity thanks to the use of new tools and new study subjects. 
Second, we perform a systematic characterization of these methods, both quantitatively and qualitatively with an extensive manual analysis of 101 pseudo-tested methods. The first part of the study confirms Niedermayr's results: pseudo-tested methods exist in all our subjects. Our in-depth characterization of pseudo-tested methods leads to two key insights: pseudo-tested methods are significantly less tested than the other methods; yet, for most of them, the developers would not pay the testing price to fix this situation. This calls for future work on targeted test generation to specify those pseudo-tested methods without spending developer time. }, year = {2019} } @inproceedings{Papoudakis1167092, author = {Papoudakis, G. and Preux, P. and Monperrus, Martin}, booktitle = {6th International Conference on Complex Networks and Their Applications, Complex Networks 2017}, institution = {KTH, Theoretical Computer Science, TCS; Université de Lille, CRIStAL \& Inria, Villeneuve d’Ascq, France}, note = {Part of ISBN 9783319721491. QC 20171218}, pages = {531--542}, title = {A generative model for sparse, evolving digraphs}, series = {Studies in Computational Intelligence}, number = {689}, volume = {689}, DOI = {10.1007/978-3-319-72150-7_43}, abstract = {Generating graphs that are similar to real ones is an open problem, while the similarity notion is quite elusive and hard to formalize. In this paper, we focus on sparse digraphs and propose SDG, an algorithm that aims at generating graphs similar to real ones. Since real graphs are evolving and this evolution is important to study in order to understand the underlying dynamical system, we tackle the problem of generating series of graphs. We propose SEDGE, an algorithm meant to generate series of graphs similar to a real series. SEDGE is an extension of SDG. 
We consider graphs that are representations of software programs and show experimentally that our approach outperforms other existing approaches. Experiments show the performance of both algorithms. }, year = {2018} } @article{Harrand1372273, author = {Harrand, Nicolas and Allier, Simon and Rodriguez-Cancio, Marcelino and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {DGA, Val De Reuil, France.}, institution = {Vanderbildt Univ, Nashville, TN USA.}, journal = {Genetic Programming and Evolvable Machines}, note = {QC 20191122}, number = {4}, pages = {531--580}, title = {A journey among Java neutral program variants}, volume = {20}, DOI = {10.1007/s10710-019-09355-3}, keywords = {Neutral program variant, Program transformation, Java, Code plasticity}, abstract = {Neutral program variants are alternative implementations of a program, yet equivalent with respect to the test suite. Techniques such as approximate computing or genetic improvement share the intuition that potential for enhancements lies in these acceptable behavioral differences (e.g., enhanced performance or reliability). Yet, the automatic synthesis of neutral program variants, through program transformations remains a key challenge. This work aims at characterizing plastic code regions in Java programs, i.e., the code regions that are modifiable while maintaining functional correctness, according to a test suite. Our empirical study relies on automatic variations of 6 real-world Java programs. First, we transform these programs with three state-of-the-art program transformations: add, replace and delete statements. 
We get a pool of 23,445 neutral variants, from which we gather the following novel insights: developers naturally write code that supports fine-grain behavioral changes; statement deletion is a surprisingly effective program transformation; high-level design decisions, such as the choice of a data structure, are natural points that can evolve while keeping functionality. Second, we design 3 novel program transformations, targeted at specific plastic regions. New experiments reveal that respectively 60%, 58% and 73% of the synthesized variants (175,688 in total) are neutral and exhibit execution traces that are different from the original. }, year = {2019} } @article{Danglot1363140, author = {Danglot, Benjamin and Vera-Perez, Oscar and Yu, Zhongxing and Zaidman, Andy and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {INRIA, Lille, France.}, institution = {INRIA, Rennes, France.}, institution = {Delft Univ Technol, Delft, Netherlands.}, journal = {Journal of Systems and Software}, note = {QC 20191022}, eid = {UNSP 110398}, title = {A snowballing literature study on test amplification}, volume = {157}, DOI = {10.1016/j.jss.2019.110398}, keywords = {Test amplification, Test augmentation, Test optimization, Test regeneration, Automatic testing}, abstract = {The adoption of agile approaches has put an increased emphasis on testing, resulting in extensive test suites. These suites include a large number of tests, in which developers embed knowledge about meaningful input data and expected properties as oracles. This article surveys works that exploit this knowledge to enhance manually written tests with respect to an engineering goal (e.g., improve coverage or refine fault localization). 
While these works rely on various techniques and address various goals, we believe they form an emerging and coherent field of research, which we coin "test amplification". We devised a first set of papers from DBLP, searching for all papers containing "test" and "amplification" in their title. We reviewed the 70 papers in this set and selected the 4 papers that fit the definition of test amplification. We use them as the seeds for our snowballing study, and systematically followed the citation graph. This study is the first that draws a comprehensive picture of the different engineering goals proposed in the literature for test amplification. We believe that this survey will help researchers and practitioners entering this new field to understand more quickly and more deeply the intuitions, concepts and techniques used for test amplification. }, year = {2019} } @article{Baudry1583272, author = {Baudry, Benoit and Chen, Zimin and Etemadi, Khashayar and Fu, Han and Ginelli, Davide and Kommrusch, Steve and Martinez, Matias and Monperrus, Martin and Ron Arteaga, Javier and Ye, He and Yu, Zhongxing}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Milano Bicocca, Comp Sci, I-20166 Milan, Italy.}, institution = {Colorado State Univ, Machine Learning, Ft Collins, CO 80523 USA.}, institution = {Univ Polytech Hauts De France, F-59260 Valenciennes, France.}, institution = {KTH Royal Inst Technol, Software Engn, S-11428 Stockholm, Sweden.}, institution = {Shandong Univ, Sch Comp Sci & Technol, Jinan 266237, Peoples R China.}, journal = {IEEE Software}, note = {QC 20210805}, number = {4}, pages = {28--35}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {A Software-Repair Robot Based on Continual Learning}, volume = {38}, DOI = {10.1109/MS.2021.3070743}, keywords = {Maintenance engineering, Computer bugs, Software development management, Bot (Internet), 
Training data, Machine learning}, abstract = {Software bugs are common, and correcting them accounts for a significant portion of the costs in the software development and maintenance process. In this article, we discuss R-Hero, our novel system for learning how to fix bugs based on continual training. }, year = {2021} } @article{Yu1295975, author = {Yu, Zhongxing and Martinez, Matias and Danglot, Benjamin and Durieux, Thomas and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Inria Lille Nord Europe, Ave Halley, F-59650 Villeneuve Dascq, France.}, institution = {Univ Valenciennes, Malvache Bldg,Campus Mont Houy, F-59313 Valenciennes 9, France.}, institution = {Inria Lille Nord Europe, Ave Halley, F-59650 Villeneuve Dascq, France.}, institution = {Inria Lille Nord Europe, Ave Halley, F-59650 Villeneuve Dascq, France.}, journal = {Empirical Software Engineering}, note = {QC 20190611}, number = {1}, pages = {33--67}, publisher = {SPRINGER}, title = {Alleviating patch overfitting with automatic test generation : a study of feasibility and effectiveness for the Nopol repair system}, volume = {24}, DOI = {10.1007/s10664-018-9619-4}, keywords = {Program repair, Synthesis-based repair, Patch overfitting, Automatic test case generation}, abstract = {Among the many different kinds of program repair techniques, one widely studied family of techniques is called test suite based repair. However, test suites are in essence input-output specifications and are thus typically inadequate for completely specifying the expected behavior of the program under repair. Consequently, the patches generated by test suite based repair techniques can just overfit to the used test suite, and fail to generalize to other tests. We deeply analyze the overfitting problem in program repair and give a classification of this problem. This classification will help the community to better understand and design techniques to defeat the overfitting problem. 
We further propose and evaluate an approach called UnsatGuided, which aims to alleviate the overfitting problem for synthesis-based repair techniques with automatic test case generation. The approach uses additional automatically generated tests to strengthen the repair constraint used by synthesis-based repair techniques. We analyze the effectiveness of UnsatGuided: 1) analytically with respect to alleviating two different kinds of overfitting issues; 2) empirically based on an experiment over the 224 bugs of the Defects4J repository. The main result is that automatic test generation is effective in alleviating one kind of overfitting, issue-regression introduction, but due to oracle problem, has minimal positive impact on alleviating the other kind of overfitting issue-incomplete fixing. }, year = {2019} } @inproceedings{Durieux1430140, author = {Durieux, T. and Abreu, R. and Monperrus, Martin and Bissyande, T. F. and Cruz, L.}, booktitle = {Proceedings - 2019 IEEE International Conference on Software Maintenance and Evolution, ICSME 2019}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20200513. Part of ISBN 9781728130941}, pages = {291--295}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {An Analysis of 35+ Million Jobs of Travis CI}, DOI = {10.1109/ICSME.2019.00044}, keywords = {continuous integration, continuous integration usage, TravisCI, Computer programming, Computer software maintenance, Continuous integrations, Corporate users, MicroSoft, Open source developers, Run test, Open source software}, abstract = {Travis CI handles automatically thousands of builds every day to, amongst other things, provide valuable feedback to thousands of open-source developers. In this paper, we investigate Travis CI to firstly understand who is using it, and when they start to use it. Secondly, we investigate how the developers use Travis CI and finally, how frequently the developers change the Travis CI configurations. 
We observed during our analysis that the main users of Travis CI are corporate users such as Microsoft. And the programming languages used in Travis CI by those users do not follow the same popularity trend than on GitHub, for example, Python is the most popular language on Travis CI, but it is only the third one on GitHub. We also observe that Travis CI is set up on average seven days after the creation of the repository and the jobs are still mainly used (60%) to run tests. And finally, we observe that 7.34% of the commits modify the Travis CI configuration. We share the biggest benchmark of Travis CI jobs (to our knowledge): It contains 35,793,144 jobs from 272,917 different GitHub projects. }, year = {2019} } @article{Danglot1452687, author = {Danglot, Benjamin and Monperrus, Martin and Rudametkin, Walter and Baudry, Benoit}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {INRIA, Lille-Nord Europe, 40 Avenue Halley, Villeneuve d’Ascq, 59650, France}, institution = {Université de Lille, 42 rue Paul Duez, 59000, Lille, France}, journal = {Empirical Software Engineering}, note = {QC 20200707}, number = {4}, pages = {2379--2415}, publisher = {Springer Nature}, title = {An approach and benchmark to detect behavioral changes of commits in continuous integration}, volume = {25}, DOI = {10.1007/s10664-019-09794-7}, keywords = {Behavioral change detection, Continuous Integration, Test amplification, Integration, Open source software, Testing, Behavioral changes, Continuous integrations, Development process, Fully automated, Generating variations, Good practices, Search-based, Test amplifications, Software testing}, abstract = {When a developer pushes a change to an application’s codebase, a good practice is to have a test case specifying this behavioral change. 
Thanks to continuous integration (CI), the test is run on subsequent commits to check that they do no introduce a regression for that behavior. In this paper, we propose an approach that detects behavioral changes in commits. As input, it takes a program, its test suite, and a commit. Its output is a set of test methods that capture the behavioral difference between the pre-commit and post-commit versions of the program. We call our approach DCI (Detecting behavioral changes in CI). It works by generating variations of the existing test cases through (i) assertion amplification and (ii) a search-based exploration of the input space. We evaluate our approach on a curated set of 60 commits from 6 open source Java projects. To our knowledge, this is the first ever curated dataset of real-world behavioral changes. Our evaluation shows that DCI is able to generate test methods that detect behavioral changes. Our approach is fully automated and can be integrated into current development processes. The main limitations are that it targets unit tests and works on a relatively small fraction of commits. More specifically, DCI works on commits that have a unit test that already executes the modified code. In practice, from our benchmark projects, we found 15.29% of commits to meet the conditions required by DCI. }, URL = {https://link.springer.com/article/10.1007/s10664-019-09794-7}, year = {2020} } @article{Martinez1300612, author = {Martinez, M. and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {Journal of Systems and Software}, note = {QC 20190329}, pages = {65--80}, title = {Astor : Exploring the design space of generate-and-validate program repair beyond GenProg}, volume = {151}, DOI = {10.1016/j.jss.2019.01.069}, keywords = {Automated Program Repair, Defects, Evaluation Frameworks, Software Bugs, Software Maintenance, Software Testing}, abstract = {This article contributes to defining the design space of program repair. 
Repair approaches can be loosely characterized according to the main design philosophy, in particular “generate- and-validate” and synthesis-based approaches. Each of those repair approaches is a point in the design space of program repair. Our goal is to facilitate the design, development and evaluation of repair approaches by providing a framework that: a) contains components commonly present in most approaches, b) provides built-in implementations of existing repair approaches. This paper presents a Java framework named Astor that focuses on the design space of generate-and-validate repair approaches. The key novelty of Astor is to provides explicit extension points to explore the design space of program repair. Thanks to those extension points, researchers can both reuse existing program repair components and implement new ones. Astor includes 6 unique implementations of repair approaches in Java, including GenProg for Java called jGenProg. Researchers have already defined new approaches over Astor. The implementations of program repair approaches built already available in Astor are capable of repairing, in total, 98 real bugs from 5 large Java programs. Astor code is publicly available on Github: https://github.com/SpoonLabs/astor. 
}, year = {2019} } @article{Etemadi1878189, author = {Etemadi, Khashayar and Sharma, Aman and Madeiral, Fernanda and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Vrije Universiteit Amsterdam, Amsterdam, HV, The Netherlands, 1081, HV}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20240701}, number = {11}, pages = {4988--5007}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Augmenting Diffs With Runtime Information}, volume = {49}, DOI = {10.1109/TSE.2023.3324258}, keywords = {Code diff, code review, dynamic program analysis, runtime differencing}, abstract = {Source code diffs are used on a daily basis as part of code review, inspection, and auditing. To facilitate understanding, they are typically accompanied by explanations that describe the essence of what is changed in the program. As manually crafting high-quality explanations is a cumbersome task, researchers have proposed automatic techniques to generate code diff explanations. Existing explanation generation methods solely focus on static analysis, i.e., they do not take advantage of runtime information to explain code changes. In this article, we propose Collector-Sahab, a novel tool that augments code diffs with runtime difference information. Collector-Sahab compares the program states of the original (old) and patched (new) versions of a program to find unique variable values. Then, Collector-Sahab adds this novel runtime information to the source code diff as shown, for instance, in code reviewing systems. As an evaluation, we run Collector-Sahab on 584 code diffs for Defects4J bugs and find it successfully augments the code diff for 95% (555/584) of them. We also perform a user study and ask eight participants to score the augmented code diffs generated by Collector-Sahab. Per this user study, we conclude that developers find the idea of adding runtime data to code diffs promising and useful. 
Overall, our experiments show the effectiveness and usefulness of Collector-Sahab in augmenting code diffs with runtime difference information. Publicly-available repository: https://github.com/ASSERT-KTH/collector-sahab. }, year = {2023} } @article{Ye1638221, author = {Ye, He and Gu, Jian and Martinez, M. and Durieux, Thomas and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20220216}, number = {8}, pages = {2920--2938}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {Automated Classification of Overfitting Patches with Statically Extracted Code Features}, volume = {48}, DOI = {10.1109/TSE.2021.3071750}, keywords = {Automatic program repair, Code features, Feature extraction, Maintenance engineering, Overfitting patch, Patch assessment, Predictive models, Software, Syntactics, Tools, Training}, abstract = {Automatic program repair (APR) aims to reduce the cost of manually fixing software defects. However, APR suffers from generating a multitude of overfitting patches, those patches that fail to correctly repair the defect beyond making the tests pass. This paper presents a novel overfitting patch detection system called ODS to assess the correctness of APR patches. ODS first statically compares a patched program and a buggy program in order to extract code features at the abstract syntax tree (AST) level. Then, ODS uses supervised learning with the captured code features and patch correctness labels to automatically learn a probabilistic model. The learned ODS model can then finally be applied to classify new and unseen program repair patches. We conduct a large-scale experiment to evaluate the effectiveness of ODS on patch correctness classification based on 10,302 patches from Defects4J, Bugs.jar and Bears benchmarks. 
The empirical evaluation shows that ODS is able to correctly classify 71.9% of program repair patches from 26 projects, which improves the state-of-the-art. ODS is applicable in practice and can be employed as a post-processing procedure to classify the patches generated by different APR systems.  }, year = {2022} } @article{Ye1542036, author = {Ye, He and Martinez, Matias and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Valenciennes, Valenciennes, France.}, journal = {Empirical Software Engineering}, note = {QC 20210406}, number = {2}, eid = {20}, publisher = {Springer Nature}, title = {Automated patch assessment for program repair at scale}, volume = {26}, DOI = {10.1007/s10664-020-09920-w}, keywords = {Automatic program repair, Automatic patch assessment}, abstract = {In this paper, we do automatic correctness assessment for patches generated by program repair systems. We consider the human-written patch as ground truth oracle and randomly generate tests based on it, a technique proposed by Shamshiri et al., called Random testing with Ground Truth (RGT) in this paper. We build a curated dataset of 638 patches for Defects4J generated by 14 state-of-the-art repair systems, we evaluate automated patch assessment on this dataset. The results of this study are novel and significant: First, we improve the state of the art performance of automatic patch assessment with RGT by 190% by improving the oracle; Second, we show that RGT is reliable enough to help scientists to do overfitting analysis when they evaluate program repair systems; Third, we improve the external validity of the program repair knowledge with the largest study ever. 
}, year = {2021} } @article{Danglot1365812, author = {Danglot, Benjamin and Vera-Perez, Oscar Luis and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Inria Lille Nord Europe, Parc Sci Haute Borne 40,Ave Halley,Bat A,Pk Plaza, F-59650 Villeneuve Dascq, France.}, institution = {Inria Rennes Bretagne Atlantique, Campus Beaulieu,263 Ave Gen Leclerc, F-35042 Rennes, France.}, journal = {Empirical Software Engineering}, note = {QC 20191025}, number = {4}, pages = {2603--2635}, title = {Automatic test improvement with DSpot : a study with ten mature open-source projects}, volume = {24}, DOI = {10.1007/s10664-019-09692-y}, keywords = {Test improvement, Junit test, Pull request empirical study}, abstract = {In the literature, there is a rather clear segregation between manually written tests by developers and automatically generated ones. In this paper, we explore a third solution: to automatically improve existing test cases written by developers. We present the concept, design and implementation of a system called DSpot, that takes developer-written test cases as input (JUnit tests in Java) and synthesizes improved versions of them as output. Those test improvements are given back to developers as patches or pull requests, that can be directly integrated in the main branch of the test code base. We have evaluated DSpot in a deep, systematic manner over 40 real-world unit test classes from 10 notable and open-source software projects. We have amplified all test methods from those 40 unit test classes. In 26/40 cases, DSpot is able to automatically improve the test under study, by triggering new behaviors and adding new valuable assertions. Next, for ten projects under consideration, we have proposed a test improvement automatically synthesized by DSpot to the lead developers. 
In total, 13/19 proposed test improvements were accepted by the developers and merged into the main code base. This shows that DSpot is capable of automatically improving unit-tests in real-world, large scale Java software. }, year = {2019} } @inproceedings{Madeiral1322959, author = {Madeiral, F. and Urli, S. and Maia, M. and Monperrus, Martin}, booktitle = {SANER 2019 - Proceedings of the 2019 IEEE 26th International Conference on Software Analysis, Evolution, and Reengineering : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20190611Part of ISBN 9781728105918}, pages = {468--478}, eid = {8667991}, title = {BEARS : An Extensible Java Bug Benchmark for Automatic Program Repair Studies}, DOI = {10.1109/SANER.2019.8667991}, keywords = {Java programming language, Open source software, Pipelines, Reengineering, Repair, Software testing, Automatic creations, Automatic programs, Bug tracking system, Continuous integrations, Open source projects, Repair tools, Research communities, Test failure, Program debugging}, abstract = {Benchmarks of bugs are essential to empirically evaluate automatic program repair tools. In this paper, we present BEARS, a project for collecting and storing bugs into an extensible bug benchmark for automatic repair studies in Java. The collection of bugs relies on commit building state from Continuous Integration (CI) to find potential pairs of buggy and patched program versions from open-source projects hosted on GitHub. Each pair of program versions passes through a pipeline where an attempt of reproducing a bug and its patch is performed. The core step of the reproduction pipeline is the execution of the test suite of the program on both program versions. If a test failure is found in the buggy program version candidate and no test failure is found in its patched program version candidate, a bug and its patch were successfully reproduced. 
The uniqueness of Bears is the usage of CI (builds) to identify buggy and patched program version candidates, which has been widely adopted in the last years in open-source projects. This approach allows us to collect bugs from a diversity of projects beyond mature projects that use bug tracking systems. Moreover, BEARS was designed to be publicly available and to be easily extensible by the research community through automatic creation of branches with bugs in a given GitHub repository, which can be used for pull requests in the BEARS repository. We present in this paper the approach employed by BEARS, and we deliver the version 1.0 of BEARS, which contains 251 reproducible bugs collected from 72 projects that use the Travis CI and Maven build environment. }, year = {2019} } @inproceedings{ReyesGarcía1932571, author = {Reyes García, Frank and Baudry, Benoit and Monperrus, Martin}, booktitle = {Proceedings - 2024 IEEE International Conference on Source Code Analysis and Manipulation, SCAM 2024 : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Université de Montréal, Montréal, Canada}, note = {Part of ISBN 9798331528508QC 20260414}, pages = {36--46}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Breaking-Good : Explaining Breaking Dependency Updates with Build Analysis}, DOI = {10.1109/SCAM63643.2024.00014}, keywords = {Breaking dependency updates, Explanations, Java, Maven, Software Dependency}, abstract = {Dependency updates often cause compilation errors when new dependency versions introduce changes that are incompatible with existing client code. Fixing breaking dependency updates is notoriously hard, as their root cause can be hidden deep in the dependency tree. We present Breaking-Good, a tool that automatically generates explanations for breaking updates. 
Breaking-Good provides a detailed categorization of compilation errors, identifying several factors related to changes in direct and indirect dependencies, incompatibilities between Java versions, and client-specific configuration. With a blended analysis of log and dependency trees, Breaking-Good generates detailed explanations for each breaking update. These explanations help developers understand the causes of the breaking update, and suggest possible actions to fix the breakage. We evaluate Breaking-Good on 243 real-world breaking dependency updates. Our results indicate that Breaking-Good accurately identifies root causes and generates automatic explanations for 70 % of these breaking updates. Our user study demonstrates that the generated explanations help developers. Breaking-Good is the first technique that automatically identifies the causes of a breaking dependency update and explains the breakage accordingly. }, year = {2024} } @inproceedings{ReyesGarcía1888722, author = {Reyes García, Frank and Gamage, Yogya and Skoglund, Gabriel and Baudry, Benoit and Monperrus, Martin}, booktitle = {Proceedings - 2024 IEEE International Conference on Software Analysis, Evolution and Reengineering, SANER 2024 : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, note = { Part of ISBN 9798350330663QC 20240823}, pages = {159--170}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {BUMP : A Benchmark of Reproducible Breaking Dependency Updates}, DOI = {10.1109/SANER60148.2024.00024}, keywords = {Benchmark, Breaking dependency updates, Dependency engineering, Java, Maven, Reproducibility}, abstract = {Third-party dependency updates can cause a build to fail if the new dependency version introduces a change that is incompatible with the usage: this is called a breaking dependency update. 
Research on breaking dependency updates is active, with works on characterization, understanding, automatic repair of breaking updates, and other software engineering aspects. All such research projects require a benchmark of breaking updates that has the following properties: 1) it contains real-world breaking updates; 2) the breaking updates can be executed; 3) the benchmark provides stable scientific artifacts of breaking updates over time, a property we call 'reproducibility'. To the best of our knowledge, such a benchmark is missing. To address this problem, we present BUMP, a new benchmark that contains reproducible breaking dependency updates in the context of Java projects built with the Maven build system. BUMP contains 571 breaking dependency updates collected from 153 Java projects. BUMP ensures long-term reproducibility of dependency updates on different platforms, guaranteeing consistent build failures. We categorize the different causes of build breakage in BUMP, providing novel insights for future work on breaking update engineering. To our knowledge, BUMP is the first of its kind, providing hundreds of real-world breaking updates that have all been made reproducible. }, year = {2024} } @article{Sharma2044960, author = {Sharma, Aman and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science}, institution = {Université de Montréal, Montréal, Canada, H3T 1J4}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20260311}, number = {1}, pages = {54--69}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Causes and Canonicalization of Unreproducible Builds in Java}, volume = {52}, DOI = {10.1109/TSE.2025.3627891}, keywords = {Java, Reproducible builds, canonicalization, software supply chain}, abstract = {The increasing complexity of software supply chains and the rise of supply chain attacks have elevated concerns around software integrity. 
Users and stakeholders face significant challenges in validating that a given software artifact corresponds to its declared source. Reproducible Builds address this challenge by ensuring that independently performed builds from identical source code produce identical binaries. However, achieving reproducibility at scale remains difficult, especially in Java, due to a range of non-deterministic factors and caveats in the build process. In this work, we focus on reproducibility in Java-based software, archetypal of enterprise applications. We introduce a conceptual framework for reproducible builds, we analyze a large dataset from Reproducible Central, and we develop a novel taxonomy of six root causes of unreproducibility. We study actionable mitigations: artifact and bytecode canonicalization using OSS-Rebuild and jNorm respectively. Finally, we present Chains-Rebuild (improvements to OSS-Rebuild), a tool that raises reproducibility success from 9.48% to 26.60% on 12,803 unreproducible artifacts. To sum up, our contributions are the first large-scale taxonomy of build unreproducibility causes in Java, a publicly available dataset of unreproducible builds, and Chains-Rebuild, a canonicalization tool for mitigating unreproducible builds in Java. 
}, year = {2026} } @article{Balliu1842523, author = {Balliu, Musard and Baudry, Benoit and Bobadilla, Sofia and Ekstedt, Mathias and Monperrus, Martin and Ron Arteaga, Javier and Sharma, Aman and Skoglund, Gabriel and Soto Valero, C{\'e}sar and Wittlinger, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Network and Systems Engineering}, journal = {IEEE Security and Privacy}, note = {QC 20240314}, number = {6}, pages = {12--23}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Challenges of Producing Software Bill of Materials for Java}, volume = {21}, DOI = {10.1109/MSEC.2023.3302956}, keywords = {Java, Software, Production, Supply chain management, Standards, Bills of materials, Software reliability}, abstract = {Software bills of materials (SBOMs) promise to become the backbone of software supply chain hardening. We deep-dive into six tools and the SBOMs they produce for complex open source Java projects, revealing challenges regarding the accurate production and usage of SBOMs. }, year = {2023} } @unpublished{Zhang1639560, author = {Zhang, Long and Ron Arteaga, Javier and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, note = {QC 20220222}, title = {Chaos Engineering of Ethereum Blockchain Clients}, DOI = {10.48550/arXiv.2111.00221}, keywords = {chaos engineering, Ethereum, fault injection, resilience benchmarking}, abstract = {The Ethereum blockchain is the operational backbone of major decentralized finance platforms. As such, it is expected to be exceptionally reliable. In this paper, we present ChaosETH, a chaos engineering tool for resilience assessment of Ethereum clients. ChaosETH operates in the following manner: First, it monitors Ethereum clients to determine their normal behavior. 
Then, it injects system call invocation errors into the Ethereum clients and observes the resulting behavior under perturbation. Finally, ChaosETH compares the behavior recorded before, during, and after perturbation to assess the impact of the injected system call invocation errors. The experiments are performed on the two most popular Ethereum client implementations: GoEthereum and OpenEthereum. We experiment with 22 different types of system call invocation errors. We assess their impact on the Ethereum clients with respect to 15 application-level metrics. Our results reveal a broad spectrum of resilience characteristics of Ethereum clients in the presence of system call invocation errors, ranging from direct crashes to full resilience. The experiments clearly demonstrate the feasibility of applying chaos engineering principles to blockchains. }, URL = {https://doi.org/10.48550/arXiv.2111.00221}, } @article{Yu1564402, author = {Yu, Zhongxing and Bai, Chenggang and Seinturier, Lionel and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Beihang Univ, Dept Automat Control, Beijing Univ Aeronaut & Astronaut, Beijing 100191, Peoples R China.;Beihang Univ, Dept Automat Control, Beijing 100191, Peoples R China.}, institution = {Inria Lille Nord Europe, F-59650 Villeneuve Dascq, France.;Univ Lille, Comp Sci, F-59000 Lille, France.;Univ Lille, Comp Sci Dept, F-59000 Lille, France.}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20210611}, number = {5}, pages = {969--986}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Characterizing the Usage, Evolution and Impact of Java Annotations in Practice}, volume = {47}, DOI = {10.1109/TSE.2019.2910516}, keywords = {Annotations, Java, Tools, Libraries, Runtime, Open source software, Annotation, software evolution, empirical study, statistical modelling}, abstract = {Annotations have been formally introduced into Java since Java 5. 
Since then, annotations have been widely used by the Java community for different purposes, such as compiler guidance and runtime processing. Despite the ever-growing use, there is still limited empirical knowledge about the actual usage of annotations in practice, the changes made to annotations during software evolution, and the potential impact of annotations on code quality. To fill this gap, we perform the first large-scale empirical study about Java annotations on 1,094 notable open-source projects hosted on GitHub. Our study systematically investigates annotation usage, annotation evolution, and annotation impact, and generates 10 novel and important findings. We also present the implications of our findings, which shed light for developers, researchers, tool builders, and language or library designers in order to improve all facets of Java annotation engineering. }, year = {2021} } @unpublished{Hidvegi1907148, author = {Hidv{\'e}gi, D{\'a}vid and Etemadi, Khashayar and Bobadilla, Sofia and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20241023}, title = {CigaR: Cost-efficient Program Repair with LLMs}, } @inproceedings{Martinez1421426, author = {Martinez, Matias and Monperrus, Martin}, booktitle = {2019 IEEE/ACM 41st International Conference on Software Engineering : Companion Proceedings (ICSE-Companion 2019)}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Polytech Hauts De France, Valenciennes, France.}, note = {QC 20200402Part of ISBN 978-1-7281-1764-5; 978-1-7281-1765-2 }, pages = {79--82}, publisher = {IEEE}, title = {Coming : Tool for Mining Change Pattern Instances from Git Commits}, DOI = {10.1109/ICSE-Companion.2019.00043}, abstract = {Software repositories such as Git have become a relevant source of information for software engineer researchers. 
For instance, the detection of commits that fulfill a given criterion (e.g., bugfixing commits) is one of the most frequent tasks done to understand the software evolution. However, to our knowledge, there is no open-source tool that, given a Git repository, returns all the instances of a given code change pattern. In this paper we present Coming, a tool that takes as input a Git repository and mines instances of code change patterns present on each commit. For that, Coming computes fine-grained code changes between two consecutive revisions, analyzes those changes to determine if they correspond to an instance of a change pattern (specified by the user using XML), and finally, after analyzing all the commits, it presents a) the frequency of code changes and b) the instances found in each commit. We evaluate Coming on a set of 28 pairs of revisions from Defects4J, finding instances of change patterns that involve If conditions on 26 of them. }, URL = {https://2019.icse-conferences.org/}, URL = {https://ieeexplore.ieee.org/xpl/conhome/8790387/proceeding}, year = {2019} } @inproceedings{Danglot1280393, author = {Danglot, Benjamin and Preux, Philippe and Baudry, Benoit and Monperrus, Martin}, booktitle = {PROCEEDINGS 2018 IEEE/ACM 40TH INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING (ICSE) : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20190118}, pages = {481--481}, publisher = {IEEE}, title = {Correctness Attraction : A Study of Stability of Software Behavior Under Runtime Perturbation}, DOI = {10.1145/3180155.3182548}, year = {2018} } @article{Danglot1269676, author = {Danglot, Benjamin and Preux, Philippe and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Centre for Advanced Software Technology Research (CASTOR)}, journal = {Empirical Software Engineering}, note = {QC 20181211}, number = {4}, pages = {2086--2119}, title = {Correctness attraction : a study of stability of 
software behavior under runtime perturbation}, volume = {23}, DOI = {10.1007/s10664-017-9571-8}, keywords = {Perturbation analysis; Software correctness; Empirical study}, abstract = {Can the execution of software be perturbed without breaking the correctness of the output? In this paper, we devise a protocol to answer this question from a novel perspective. In an experimental study, we observe that many perturbations do not break the correctness in ten subject programs. We call this phenomenon “correctness attraction”. The uniqueness of this protocol is that it considers a systematic exploration of the perturbation space as well as perfect oracles to determine the correctness of the output. To this extent, our findings on the stability of software under execution perturbations have a level of validity that has never been reported before in the scarce related work. A qualitative manual analysis enables us to set up the first taxonomy ever of the reasons behind correctness attraction. }, year = {2018} } @inproceedings{CabreraArteaga1543343, author = {Cabrera Arteaga, Javier and Floros, Orestis and Vera Perez, Oscar and Baudry, Benoit and Monperrus, Martin}, booktitle = { : }, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Rennes, Inria, CNRS, IRISA}, note = {Part of proceedings: ISBN 1-891562-66-5, QC 20230117}, title = {CROW: Code Diversification for WebAssembly}, DOI = {10.14722/madweb.2021.23004}, keywords = {WebAssembly, Web, Diversification}, abstract = {The adoption of WebAssembly increases rapidly, as it provides a fast and safe model for program execution in the browser. However, WebAssembly is not exempt from vulnerabilities that can be exploited by malicious observers. Code diversification can mitigate some of these attacks. In this paper, we present the first fully automated workflow for the diversification of WebAssembly binaries. 
We present CROW, an open-source tool implementing this workflow through enumerative synthesis of diverse code snippets expressed in the LLVM intermediate representation. We evaluate CROW’s capabilities on 303 C programs and study its use on a real-life security-sensitive program: libsodium, a modern cryptographic library. Overall, CROW is able to generate diverse variants for 239 out of 303 (79%) small programs. Furthermore, our experiments show that our approach and tool is able to successfully diversify off-the-shelf cryptographic software (libsodium). }, URL = {https://dx.doi.org/10.14722/madweb.2021.23004}, year = {2021} } @inproceedings{Vera-Perez1673477, author = {Vera-P{\'e}rez, O. L. and Monperrus, Martin and Baudry, Benoit}, booktitle = {ASE 2018 - Proceedings of the 33rd ACM/IEEE International Conference on Automated Software Engineering : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, note = {Part of proceedings: ISBN 978-1-4503-5937-5QC 20220621}, pages = {908--911}, publisher = {Association for Computing Machinery (ACM)}, title = {Descartes : A pitest engine to detect pseudo-tested methods: Tool demonstration}, DOI = {10.1145/3238147.3240474}, keywords = {Extreme mutation, Mutation testing, PITest, Pseudo-tested methods, Software testing, Engines, Open source software, Mutation operators, Mutation score, Open source projects, Tool demonstration}, abstract = {Descartes is a tool that implements extreme mutation operators and aims at finding pseudo-tested methods in Java projects. It leverages the efficient transformation and runtime features of PITest. The demonstration compares Descartes with Gregor, the default mutation engine provided by PITest, in a set of real open source projects. It considers the execution time, number of mutants created and the relationship between the mutation scores produced by both engines. It provides some insights on the main features exposed by Descartes. 
}, URL = {http://www.ase2018.com/}, year = {2018} } @inproceedings{Liu2000817, author = {Liu, Raphina and Bobadilla, Sofia and Baudry, Benoit and Monperrus, Martin}, booktitle = {FSE Companion 2025 - Companion Proceedings of the 33rd ACM International Conference on the Foundations of Software Engineering : }, institution = {KTH}, institution = {KTH, Theoretical Computer Science, TCS}, note = {Part of ISBN 9798400712760QC 20250925}, pages = {1045--1049}, publisher = {Association for Computing Machinery (ACM)}, title = {Dirty-Waters: Detecting Software Supply Chain Smells}, DOI = {10.1145/3696630.3728578}, keywords = {Open Source, Software Security, Software Supply Chain}, abstract = {Using open-source dependencies is essential in modern software development. However, this practice implies significant trust in third-party code, while there is little support for developers to assess this trust. As a consequence, attacks, called software supply chain attacks, have been increasingly occurring through third-party dependencies. In this paper, we target the problem of projects that use dependencies, where developers are unaware of the potential risks posed by their software supply chain. We define the novel concept of software supply chain smell and present Dirty-Waters, a novel tool for detecting software supply chain smells. We evaluate Dirty-Waters on three JavaScript projects and demonstrate the prevalence of all proposed software supply chain smells. Dirty-Waters reveals potential risks for previously invisible problems and provides clear indicators for developers to act on the security of their supply chain. A video demonstrating Dirty-Waters is available at: http://l.4open.science/dirty-waters-demo. 
}, year = {2025} } @inproceedings{Sobreira1243077, author = {Sobreira, Victor and Durieux, Thomas and Madeiral, Fernanda and Monperrus, Martin and De Almeida Maia, Marcelo}, booktitle = {25th IEEE International Conference on Software Analysis, Evolution and Reengineering, SANER 2018 - Proceedings : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Fed Uberlandia, Uberlandia, MG, Brazil.}, institution = {INRIA, Rocquencourt, France.;Univ Lille, Lille, France.}, institution = {Univ Fed Uberlandia, Uberlandia, MG, Brazil.}, institution = {Univ Fed Uberlandia, Uberlandia, MG, Brazil.}, note = {Part of proceedings: ISBN 978-1-5386-4969-5QC 20180830}, pages = {130--140}, title = {Dissection of a bug dataset : Anatomy of 395 patches from Defects4J}, DOI = {10.1109/SANER.2018.8330203}, abstract = {Well-designed and publicly available datasets of bugs are an invaluable asset to advance research fields such as fault localization and program repair as they allow directly and fairly comparison between competing techniques and also the replication of experiments. These datasets need to be deeply understood by researchers: The answer for questions like 'which bugs can my technique handle?' and 'for which bugs is my technique effective?' depends on the comprehension of properties related to bugs and their patches. However, such properties are usually not included in the datasets, and there is still no widely adopted methodology for characterizing bugs and patches. In this work, we deeply study 395 patches of the Defects4J dataset. Quantitative properties (patch size and spreading) were automatically extracted, whereas qualitative ones (repair actions and patterns) were manually extracted using a thematic analysis-based approach. 
We found that 1) the median size of Defects4J patches is four lines, and almost 30% of the patches contain only addition of lines; 2) 92% of the patches change only one file, and 38% has no spreading at all; 3) the top-3 most applied repair actions are addition of method calls, conditionals, and assignments, occurring in 77% of the patches; and 4) nine repair patterns were found for 95% of the patches, where the most prevalent, appearing in 43% of the patches, is on conditional blocks. These results are useful for researchers to perform advanced analysis on their techniques' results based on Defects4J. Moreover, our set of properties can be used to characterize and compare different bug datasets. }, year = {2018} } @article{Bobadilla2047171, author = {Bobadilla, Sofia and Jin, Monica and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20260319}, number = {1}, pages = {100--115}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Do Automated Fixes Truly Mitigate Smart Contract Exploits?}, volume = {52}, DOI = {10.1109/TSE.2025.3618123}, keywords = {Smart contracts, Maintenance engineering, Codes, Source coding, Blockchains, Prevention and mitigation, Manuals, Static analysis, Systematic literature review, Formal verification}, abstract = {Automated Program Repair (APR) for smart contract security promises to automatically mitigate smart contract vulnerabilities responsible for billions in financial losses. However, the true effectiveness of this research in addressing smart contract exploits remains uncharted territory. This paper bridges this critical gap by introducing a novel and systematic experimental framework for evaluating exploit mitigation of program repair tools for smart contracts. 
We qualitatively and quantitatively analyze 20 state-of-the-art APR tools using a dataset of 143 vulnerable smart contracts, for which we manually craft 91 executable exploits. We are the very first to define and measure the essential "exploit mitigation rate", giving researchers and practitioners a real sense of effectiveness. Our findings reveal substantial disparities in the state of the art, with an exploit mitigation rate ranging from a low of 29% to a high of 74%. Our study identifies systemic limitations, such as inconsistent functionality preservation, that must be addressed in future research on program repair for smart contracts. }, year = {2026} } @misc{Baudry1639160, author = {Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20220314}, title = {Dynamic Analysis in the Browser}, URL = {https://cacm.acm.org/blogs/blog-cacm/239266-dynamic-analysis-in-the-browser/fulltext}, year = {2019} } @article{Monperrus1240389, author = {Monperrus, Martin and Weimer, Westley}, institution = {KTH}, institution = {Univ Michigan, Ann Arbor, MI 48109 USA.}, journal = {Empirical Software Engineering}, note = {QC 20180821}, number = {5}, pages = {2865--2865}, title = {Editor's Note : Special Issue on Automatic Software Repair}, volume = {23}, DOI = {10.1007/s10664-018-9632-7}, year = {2018} } @article{Etemadi1658845, author = {Etemadi, Khashayar and Tarighat, Niloofar and Yadav, Siddharth and Martinez, Matias and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Sharif Univ Technol, Tehran, Iran.}, institution = {Indraprastha Inst Informat Technol Delhi, Delhi, India.}, institution = {Univ Polytech Hauts De France, Valenciennes, France.}, journal = {Journal of Systems and Software}, note = {QC 20220817}, eid = {111263}, publisher = {Elsevier BV}, title = {Estimating the potential of program repair search spaces with commit 
analysis}, volume = {188}, DOI = {10.1016/j.jss.2022.111263}, keywords = {Program repair, Search-space, Static code analysis, Commit analysis}, abstract = {The most natural method for evaluating program repair systems is to run them on bug datasets, such as Defects4J. Yet, using this evaluation technique on arbitrary real-world programs requires heavy configuration. In this paper, we propose a purely static method to evaluate the potential of the search space of repair approaches. This new method enables researchers and practitioners to encode the search spaces of repair approaches and select potentially useful ones without struggling with tool configuration and execution. We encode the search spaces by specifying the repair strategies they employ. Next, we use the specifications to check whether past commits lie in repair search spaces. For a repair approach, including many human-written past commits in its search space indicates its potential to generate useful patches. We implement our evaluation method in LIGHTER. LIGHTER gets a Git repository and outputs a list of commits whose source code changes lie in repair search spaces. We run LIGHTER on 55,309 commits from the history of 72 GitHub repositories and show that LIGHTER's precision and recall are 77% and 92%, respectively. Overall, our experiments show that our novel method is both lightweight and effective to study the search space of program repair approaches. 
}, year = {2022} } @inproceedings{Durieux1245426, author = {Durieux, Thomas and Hamadi, Youssef and Yu, Zhongxing and Baudry, Benoit and Monperrus, Martin}, booktitle = {2018 IEEE 11TH INTERNATIONAL CONFERENCE ON SOFTWARE TESTING, VERIFICATION AND VALIDATION (ICST) : }, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Lille, Lille, France.;INRIA, Le Chesnay, France.}, institution = {Ecole Polytech, Palaiseau, France.}, institution = {Univ Lille, Lille, France.;INRIA, Le Chesnay, France.}, note = {QC 20180905}, pages = {139--149}, title = {Exhaustive Exploration of the Failure-oblivious Computing Search Space}, series = {IEEE International Conference on Software Testing Verification and Validation}, DOI = {10.1109/ICST.2018.00023}, abstract = {High-availability of software systems requires automated handling of crashes in presence of errors. Failure-oblivious computing is one technique that aims to achieve high availability. We note that failure-obliviousness has not been studied in depth yet, and there is very few study that helps understand why failure-oblivious techniques work. In order to make failure-oblivious computing to have an impact in practice, we need to deeply understand failure-oblivious behaviors in software. In this paper, we study, design and perform an experiment that analyzes the size and the diversity of the failure-oblivious behaviors. Our experiment consists of exhaustively computing the search space of 16 field failures of large-scale open-source Java software. The outcome of this experiment is a much better understanding of what really happens when failure-oblivious computing is used, and this opens new promising research directions. 
}, ISBN = {978-1-5386-5012-7}, year = {2018} } @inproceedings{Monperrus1417230, author = {Monperrus, Martin}, booktitle = {Proceedings - 2019 IEEE/ACM 1st International Workshop on Bots in Software Engineering, BotSE 2019 : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20200327}, pages = {12--15}, title = {Explainable software bot contributions : Case study of automated bug fixes}, DOI = {10.1109/BotSE.2019.00010}, abstract = {In a software project, esp. in open-source, a contribution is a valuable piece of work made to the project: writing code, reporting bugs, translating, improving documentation, creating graphics, etc. We are now at the beginning of an exciting era where software bots will make contributions that are of similar nature than those by humans. Dry contributions, with no explanation, are often ignored or rejected, because the contribution is not understandable per se, because they are not put into a larger context, because they are not grounded on idioms shared by the core community of developers. We have been operating a program repair bot called Repairnator for 2 years and noticed the problem of "dry patches": a patch that does not say which bug it fixes, or that does not explain the effects of the patch on the system. We envision program repair systems that produce an "explainable bug fix": an integrated package of at least 1) a patch, 2) its explanation in natural or controlled language, and 3) a highlight of the behavioral difference with examples. In this paper, we generalize and suggest that software bot contributions must be explainable, that they must be put into the context of the global software development conversation. }, URL = {https://ieeexplore.ieee.org/document/8823632}, year = {2019} } @article{Koyuncu1435645, author = {Koyuncu, Anil and Liu, Kui and Bissyande, Tegawende F. 
and Kim, Dongsun and Klein, Jacques and Monperrus, Martin and Le Traon, Yves}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Luxembourg, SnT, Luxembourg, Luxembourg.}, institution = {Univ Luxembourg, SnT, Luxembourg, Luxembourg.}, institution = {Univ Luxembourg, SnT, Luxembourg, Luxembourg.}, institution = {Furiosa Ai, 145 Dosan Daero, Seoul, South Korea.}, institution = {Univ Luxembourg, SnT, Luxembourg, Luxembourg.}, institution = {Univ Luxembourg, SnT, Luxembourg, Luxembourg.}, journal = {Empirical Software Engineering}, note = {QC 20200605}, number = {3}, pages = {1980--2024}, publisher = {Springer}, title = {FixMiner : Mining relevant fix patterns for automated program repair}, volume = {25}, DOI = {10.1007/s10664-019-09780-z}, keywords = {Fix patterns, Patches, Program repair, Debugging, Empirical software engineering}, abstract = {Patching is a common activity in software development. It is generally performed on a source code base to address bugs or add new functionalities. In this context, given the recurrence of bugs across projects, the associated similar patches can be leveraged to extract generic fix actions. While the literature includes various approaches leveraging similarity among patches to guide program repair, these approaches often do not yield fix patterns that are tractable and reusable as actionable input to APR systems. In this paper, we propose a systematic and automated approach to mining relevant and actionable fix patterns based on an iterative clustering strategy applied to atomic changes within patches. The goal of FixMiner is thus to infer separate and reusable fix patterns that can be leveraged in other patch generation systems. Our technique, FixMiner, leverages Rich Edit Script which is a specialized tree structure of the edit scripts that captures the AST-level context of the code changes. 
FixMiner uses different tree representations of Rich Edit Scripts for each round of clustering to identify similar changes. These are abstract syntax trees, edit actions trees, and code context trees. We have evaluated FixMiner on thousands of software patches collected from open source projects. Preliminary results show that we are able to mine accurate patterns, efficiently exploiting change information in Rich Edit Scripts. We further integrated the mined patterns to an automated program repair prototype, PAR(FixMiner), with which we are able to correctly fix 26 bugs of the Defects4J benchmark. Beyond this quantitative performance, we show that the mined fix patterns are sufficiently relevant to produce patches with a high probability of correctness: 81% of PAR(FixMiner)'s generated plausible patches are correct. }, year = {2020} } @inproceedings{Tan1651818, author = {Tan, Shin Hwei and Mechtaev, Sergey and Zhang, Lingming and Monperrus, Martin}, booktitle = {Proceedings - 2021 IEEE/ACM International Workshop on Automated Program Repair, APR 2021 : APR 2021}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Southern University of Science and Technology, China}, institution = {University College, London, United Kingdom}, institution = {University of Illinois, Urbana-Champaign, United States}, note = {Part of proceedings: ISBN 978-1-6654-4472-9QC 20220413}, eid = {9474539}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Foreword}, DOI = {10.1109/APR52552.2021.00005}, abstract = {Presents the introductory welcome message from the conference proceedings. May include the conference officers' congratulations to all involved with the conference event and publication of the proceedings record. 
}, year = {2021} } @article{Durieux1416641, author = {Durieux, Thomas and Hamadi, Youssef and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Lisbon, INESC ID, Lisbon, Portugal.;Univ Lisbon, IST, Lisbon, Portugal.}, institution = {Uber Elevate Res, Paris, France.}, journal = {Software testing, verification & reliability}, note = {QC 20200324}, number = {2}, eid = {e1731}, publisher = {WILEY}, title = {Fully Automated HTML and JavaScript Rewriting for Constructing a Self-healing Web Proxy}, volume = {30}, DOI = {10.1002/stvr.1731}, keywords = {self-healing, bugs, JavaScript, proxy, chrome extension}, abstract = {Over the last few years, the complexity of web applications has increased to provide more dynamic web applications to users. The drawback of this complexity is the growing number of errors in the front-end applications. In this paper, we present an approach to provide self-healing for the web. We implemented this approach in two different tools: (i) BikiniProxy, an HTTP repair proxy, and (ii) BugBlock, a browser extension. They use five self-healing strategies to rewrite the buggy HTML and JavaScript code to handle errors in web pages. We evaluate BikiniProxy and BugBlock with a new benchmark of 555 reproducible JavaScript errors of which 31.76% can be automatically self-healed by BikiniProxy and 15.67% by BugBlock. 
}, year = {2020} } @inproceedings{Durieux1319732, author = {Durieux, Thomas and Hamadi, Youssef and Monperrus, Martin}, booktitle = {2018 29TH IEEE INTERNATIONAL SYMPOSIUM ON SOFTWARE RELIABILITY ENGINEERING (ISSRE) : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {INRIA, Lille, France.;Univ Lille, Lille, France.}, institution = {Ecole Polytech, Paris, France.}, note = {QC 20190603}, pages = {1--12}, publisher = {IEEE}, title = {Fully Automated HTML and Javascript Rewriting for Constructing a Self-healing Web Proxy}, series = {Proceedings International Symposium on Software Reliability Engineering}, DOI = {10.1109/ISSRE.2018.00012}, abstract = {Over the last few years, the complexity of web applications has increased to provide more dynamic web applications to users. The drawback of this complexity is the growing number of errors in the front-end applications. In this paper, we present BikiniProxy, a novel technique to provide self-healing for the web. BikiniProxy is designed as an HTTP proxy that uses five self-healing strategies to rewrite the buggy HTML and Javascript code. We evaluate BikiniProxy with a new benchmark of 555 reproducible Javascript errors of which 31.76% can be automatically self-healed. 
}, ISBN = {978-1-5386-8321-7}, year = {2018} } @article{Baudry1909148, author = {Baudry, Benoit and Etemadi, Khashayar and Fang, Sen and Gamage, Yogya and Liu, Yi and Liu, Yuxin and Monperrus, Martin and Ron Arteaga, Javier and Silva, Andre and Tiwari, Deepika}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {Univ Montreal, Software Technol, Montreal, PQ H3T 1J4, Canada.}, institution = {North Carolina State Univ, Raleigh, NC 27606 USA.}, journal = {IEEE Software}, note = {QC 20241030}, number = {6}, pages = {55--64}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Generative AI to Generate Test Data Generators}, volume = {41}, DOI = {10.1109/MS.2024.3418570}, keywords = {Generators, Cultural differences, Testing, Libraries, Java, Codes, Vectors}, abstract = {High quality data is essential for designing effective software test suites. We propose three original methods for using large language models to generate representative test data, which fit to the domain of the program under test and are culturally adequate. 
}, year = {2024} } @inproceedings{Saavedra1869243, author = {Saavedra, Nuno and Silva, Andr{\'e} and Monperrus, Martin}, booktitle = {Proceedings - 2024 ACM/IEEE 46th International Conference on Software Engineering: Companion, ICSE-Companion 2024 : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {INESC-ID/IST, University of Lisbon, Lisbon, Portugal}, note = { Part of ISBN 979-840070502-1. QC 20240613}, pages = {1--5}, publisher = {Association for Computing Machinery (ACM)}, title = {GitBug-Actions: Building Reproducible Bug-Fix Benchmarks with GitHub Actions}, series = {Proceedings - International Conference on Software Engineering}, DOI = {10.1145/3639478.3640023}, keywords = {Bug Benchmark, Bug Database, GitHub Actions, Program Analysis, Reproducibility, Software Bugs, Software Testing}, abstract = {Bug-fix benchmarks are fundamental in advancing various subfields of software engineering such as automatic program repair (APR) and fault localization (FL). A good benchmark must include recent examples that accurately reflect technologies and development practices of today. To be executable in the long term, a benchmark must feature test suites that do not degrade overtime due to, for example, dependencies that are no longer available. Existing benchmarks fail in meeting both criteria. For instance, Defects4J, one of the foremost Java benchmarks, last received an update in 2020. Moreover, full-reproducibility has been neglected by the majority of existing benchmarks. In this paper, we present GitBug-Actions: a novel tool for building bug-fix benchmarks with modern and fully-reproducible bug-fixes. GitBug-Actions relies on the most popular CI platform, GitHub Actions, to detect bug-fixes and smartly locally execute the CI pipeline in a controlled and reproducible environment. To the best of our knowledge, we are the first to rely on GitHub Actions to collect bug-fixes. 
To demonstrate our toolchain, we deploy GitBug-Actions to build a proof-of-concept Go bug-fix benchmark containing executable, fully-reproducible bug-fixes from different repositories. A video demonstrating GitBug-Actions is available at: https://youtu.be/aBWwa1sJYBs. }, year = {2024} } @inproceedings{Silva1894742, author = {Silva, Andr{\'e} and Saavedra, Nuno and Monperrus, Martin}, booktitle = {2024 IEEE/ACM 21St International Conference On Mining Software Repositories, Msr : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Lisbon, INESC ID IST, Lisbon, Portugal.}, note = {Part of ISBN 979-8-3503-6398-2, 979-8-4007-0587-8. QC 20240903}, pages = {118--122}, publisher = {Association for Computing Machinery (ACM)}, title = {GitBug-Java : A Reproducible Benchmark of Recent Java Bugs}, series = {IEEE International Working Conference on Mining Software Repositories}, DOI = {10.1145/3643991.3644884}, keywords = {Software Bugs, Bug Benchmark, Reproducibility, Bug Database, Java Benchmark, Software Testing, Program Analysis}, abstract = {Bug-fix benchmarks are essential for evaluating methodologies in automatic program repair (APR) and fault localization (FL). However, existing benchmarks, exemplified by Defects4J, need to evolve to incorporate recent bug-fixes aligned with contemporary development practices. Moreover, reproducibility, a key scientific principle, has been lacking in bug-fix benchmarks. To address these gaps, we present GitBug-Java, a reproducible benchmark of recent Java bugs. GitBug-Java features 199 bugs extracted from the 2023 commit history of 55 notable open-source repositories. The methodology for building GitBug-Java ensures the preservation of bug-fixes in fully-reproducible environments. We publish GitBug-Java at https://github.com/gitbugactions/gitbug-java. 
}, year = {2024} } @inproceedings{Cesarano1927857, author = {Cesarano, Carmine and Andersson, Vivi and Natella, Roberto and Monperrus, Martin}, booktitle = {SCORED 2024 - Proceedings of the 2024 Workshop on Software Supply Chain Offensive Research and Ecosystem Defenses, Co-Located with: CCS 2024 : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Università degli Studi di Napoli Federico II Naples, Italy}, institution = {Università degli Studi di Napoli Federico II Naples, Italy}, note = {Part of ISBN 979-840071240-1QC 20250117}, pages = {33--42}, publisher = {Association for Computing Machinery (ACM)}, title = {GoSurf: Identifying Software Supply Chain Attack Vectors in Go}, DOI = {10.1145/3689944.3696166}, keywords = {Golang, Open-Source Security, Supply Chain Attacks}, abstract = {In Go, the widespread adoption of open-source software has led to a flourishing ecosystem of third-party dependencies, which are often integrated into critical systems. However, the reuse of dependencies introduces significant supply chain security risks, as a single compromised package can have cascading impacts. Existing supply chain attack taxonomies overlook language-specific features that can be exploited by attackers to hide malicious code. In this paper, we propose a novel taxonomy of 12 distinct attack vectors tailored for the Go language and its package lifecycle. Our taxonomy identifies patterns in which language-specific Go features, intended for benign purposes, can be misused to propagate malicious code stealthily through supply chains. Additionally, we introduce GoSurf, a static analysis tool that analyzes the attack surface of Go packages according to our proposed taxonomy. We evaluate GoSurf on a corpus of 500 widely used, real-world Go packages. 
Our work provides preliminary insights for securing the open-source software supply chain within the Go ecosystem, allowing developers and security analysts to prioritize code audit efforts and uncover hidden malicious behaviors. }, year = {2024} } @inproceedings{Zetterlund1697096, author = {Zetterlund, Louise and Tiwari, Deepika and Monperrus, Martin and Baudry, Benoit}, booktitle = {2022 IEEE Conference on Software Testing, Verification and Validation (ICST 2022) : }, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Redeye AB, Stockholm, Sweden.}, note = {Part of proceedings: ISBN 978-1-6654-6679-0, QC 20220920}, pages = {365--376}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Harvesting Production GraphQL Queries to Detect Schema Faults}, series = {IEEE International Conference on Software Testing Verification and Validation}, DOI = {10.1109/ICST53961.2022.00014}, keywords = {GraphQL, production monitoring, automated test generation, test oracle, API testing, schema}, abstract = {GraphQL is a new paradigm to design web APIs. Despite its growing popularity, there are few techniques to verify the implementation of a GraphQL, API. We present a new testing approach based on GraphQL queries that are logged while users interact with an application in production. Our core motivation is that production queries capture real usages of the application, and are known to trigger behavior that may not be tested by developers. For each logged query, a test is generated to assert the validity of the GraphQL response with respect to the schema. We implement our approach in a tool called AutoGraphQL, and evaluate it on two real-world case studies that are diverse in their domain and technology stack: an open-source e-commerce application implemented in Python called Saleor, and an industrial case study which is a PHP-based finance website called Frontapp. 
AutoGraphQL successfully generates test cases for the two applications. The generated tests cover 26.9% of the Saleor schema, including parts of the API not exercised by the original test suite, as well as 48.7% of the Frontapp schema, detecting 8 schema faults, thanks to production queries. }, year = {2022} } @article{RonArteaga1881983, author = {Ron Arteaga, Javier and Soto Valero, C{\'e}sar and Zhang, Long and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, journal = {IEEE Transactions on Dependable and Secure Computing}, note = {QC 20240704}, number = {4}, pages = {4084--4097}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Highly Available Blockchain Nodes With N-Version Design}, volume = {21}, DOI = {10.1109/TDSC.2023.3346195}, keywords = {availability, blockchain, Blockchains, Computer architecture, N-Version design, Peer-to-peer computing, Programming, Prototypes, Software, Time factors}, abstract = {As all software, blockchain nodes are exposed to faults in their underlying execution stack. Unstable execution environments can disrupt the availability of blockchain nodes' interfaces, resulting in downtime for users. This paper introduces the concept of N-Version Blockchain nodes. This new type of node relies on simultaneous execution of different implementations of the same blockchain protocol, in the line of Avizienis' N-Version programming vision. We design and implement an N-Version blockchain node prototype in the context of Ethereum, called N-ETH. We show that N-ETH is able to mitigate the effects of unstable execution environments and significantly enhance availability under environment faults. To simulate unstable execution environments, we perform fault injection at the system-call level. Our results show that existing Ethereum node implementations behave asymmetrically under identical instability scenarios. 
N-ETH leverages this asymmetric behavior available in the diverse implementations of Ethereum nodes to provide increased availability, even under our most aggressive fault-injection strategies. We are the first to validate the relevance of N-Version design in the domain of blockchain infrastructure. From an industrial perspective, our results are of utmost importance for businesses operating blockchain nodes, including Google, ConsenSys, and many other major blockchain companies. }, year = {2024} }