pubs.bib

@article{9904025,
  author = {Dey, Tapajit and Jiang, Willem and Fitzgerald, Brian},
  title = {Knights and Gold Stars: A Tale of {InnerSource} Incentivization},
  journal = {IEEE Software},
  year = {2022},
  pages = {2--12},
  doi = {10.1109/MS.2022.3192647}
}
@article{mahmoud2022one,
  author = {Mahmoud, Ahmed Samir Imam and Dey, Tapajit and Nolte, Alexander and Mockus, Audris and Herbsleb, James D.},
  title = {One-off events? {An} empirical study of hackathon code creation and reuse},
  journal = {Empirical Software Engineering},
  year = {2022},
  volume = {27},
  number = {7},
  pages = {1--49},
  publisher = {Springer}
}
@inproceedings{dey2021skillspace,
  author = {Dey, Tapajit and Karnauch, Andrey and Mockus, Audris},
  title = {Representation of Developer Expertise in Open Source Software},
  booktitle = {2021 IEEE/ACM 43rd International Conference on Software Engineering (ICSE)},
  year = {2021},
  pages = {995--1007},
  doi = {10.1109/ICSE43902.2021.00094}
}
@inproceedings{imam2021secret,
  author = {Imam, Ahmed and Dey, Tapajit and Nolte, Alexander and Mockus, Audris and Herbsleb, James D.},
  title = {The Secret Life of Hackathon Code {Where} does it come from and where does it go?},
  booktitle = {2021 IEEE/ACM 18th International Conference on Mining Software Repositories (MSR)},
  year = {2021},
  pages = {68--79},
  doi = {10.1109/MSR52588.2021.00020}
}
@inproceedings{imam2021tracking,
  author = {Imam, Ahmed and Dey, Tapajit},
  title = {Tracking Hackathon Code Creation and Reuse},
  booktitle = {2021 IEEE/ACM 18th International Conference on Mining Software Repositories (MSR)},
  year = {2021},
  pages = {615--617},
  doi = {10.1109/MSR52588.2021.00085}
}
@article{ma2021world,
  author = {Ma, Yuxing and Dey, Tapajit and Bogart, Chris and Amreen, Sadika and Valiev, Marat and Tutko, Adam and Kennard, David and Zaretzki, Russell and Mockus, Audris},
  title = {{World of Code}: enabling a research workflow for mining and analyzing the universe of open source {VCS} data},
  journal = {Empirical Software Engineering},
  year = {2021},
  volume = {26},
  number = {2},
  pages = {1--42},
  publisher = {Springer},
  doi = {10.1007/s10664-020-09905-9},
  url = {https://doi.org/10.1007/s10664-020-09905-9}
}
@inproceedings{dey2020pullrequest,
  author = {Dey, Tapajit and Mockus, Audris},
  title = {Effect of Technical and Social Factors on Pull Request Quality for the {NPM} Ecosystem},
  year = {2020},
  isbn = {9781450375801},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3382494.3410685},
  doi = {10.1145/3382494.3410685},
  abstract = {Background: Pull request (PR) based development, which is a norm for the social coding platforms, entails the challenge of evaluating the contributions of, often unfamiliar, developers from across the open source ecosystem and, conversely, submitting a contribution to a project with unfamiliar maintainers. Previous studies suggest that the decision of accepting or rejecting a PR may be influenced by a diverging set of technical and social factors, but often focus on relatively few projects, do not consider ecosystem-wide measures, or the possible non-monotonic relationships between the predictors and PR acceptance probability. Aim: We aim to shed light on this important decision making process by testing which measures significantly affect the probability of PR acceptance on a significant fraction of a large ecosystem, rank them by their relative importance in predicting PR acceptance, and determine the shape of the functions that map each predictor to PR acceptance. Method: We proposed seven hypotheses regarding which technical and social factors might affect PR acceptance and created 17 measures based on them. Our dataset consisted of 470,925 PRs from 3349 popular NPM packages and 79,128 GitHub users who created those. We tested which of the measures affect PR acceptance and ranked the significant measures by their importance in a predictive model. Results: Our predictive model had and AUC of 0.94, and 15 of the 17 measures were found to matter, including five novel ecosystem-wide measures. Measures describing the number of PRs submitted to a repository and what fraction of those get accepted, and signals about the PR review phase were most significant. We also discovered that only four predictors have a linear influence on the PR acceptance probability while others showed a more complicated response. 
Conclusion: Our findings should be helpful for PR creators, integrators, as well as tool designers to focus on the important factors affecting PR acceptance.},
  booktitle = {Proceedings of the 14th ACM / IEEE International Symposium on Empirical Software Engineering and Measurement (ESEM)},
  articleno = {11},
  numpages = {11},
  keywords = {Predictive Model, Social Factors, NPM Packages, Pull Request},
  location = {Bari, Italy},
  series = {ESEM '20}
}
@inproceedings{dey2020botdetection,
  author = {Dey, Tapajit and Mousavi, Sara and Ponce, Eduardo and Fry, Tanner and Vasilescu, Bogdan and Filippova, Anna and Mockus, Audris},
  title = {Detecting and Characterizing Bots That Commit Code},
  year = {2020},
  isbn = {9781450375177},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3379597.3387478},
  doi = {10.1145/3379597.3387478},
  abstract = {Background: Some developer activity traditionally performed manually, such as making code commits, opening, managing, or closing issues is increasingly subject to automation in many OSS projects. Specifically, such activity is often performed by tools that react to events or run at specific times. We refer to such automation tools as bots and, in many software mining scenarios related to developer productivity or code quality, it is desirable to identify bots in order to separate their actions from actions of individuals. Aim: Find an automated way of identifying bots and code committed by these bots, and to characterize the types of bots based on their activity patterns. Method and Result: We propose BIMAN, a systematic approach to detect bots using author names, commit messages, files modified by the commit, and projects associated with the commits. For our test data, the value for AUC-ROC was 0.9. We also characterized these bots based on the time patterns of their code commits and the types of files modified, and found that they primarily work with documentation files and web pages, and these files are most prevalent in HTML and JavaScript ecosystems. We have compiled a shareable dataset containing detailed information about 461 bots we found (all of which have more than 1000 commits) and 13,762,430 commits they created.},
  booktitle = {Proceedings of the 17th International Conference on Mining Software Repositories},
  pages = {209--219},
  numpages = {11},
  keywords = {random forest, ensemble model, software engineering, social coding platforms, automated commits, bots},
  location = {Seoul, Republic of Korea},
  series = {MSR '20}
}
@inproceedings{dey2020botse,
  author = {Dey, Tapajit and Vasilescu, Bogdan and Mockus, Audris},
  title = {An Exploratory Study of Bot Commits},
  year = {2020},
  isbn = {9781450379632},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3387940.3391502},
  doi = {10.1145/3387940.3391502},
  abstract = {Background: Bots help automate many of the tasks performed by software developers and are widely used to commit code in various social coding platforms. At present, it is not clear what types of activities these bots perform and understanding it may help design better bots, and find application areas which might benefit from bot adoption. Aim: We aim to categorize the Bot Commits by the type of change (files added, deleted, or modified), find the more commonly changed file types, and identify the groups of file types that tend to get updated together. Method: 12,326,137 commits made by 461 popular bots (that made at least 1000 commits) were examined to identify the frequency and the type of files added/ deleted/ modified by the commits, and association rule mining was used to identify the types of files modified together. Result: Majority of the bot commits modify an existing file, a few of them add new files, while deletion of a file is very rare. Commits involving more than one type of operation are even rarer. Files containing data, configuration, and documentation are most frequently updated, while HTML is the most common type in terms of the number of files added, deleted, and modified. Files of the type "Markdown","Ignore List", "YAML", "JSON" were the types that are updated together with other types of files most frequently. Conclusion: We observe that majority of bot commits involve single file modifications, and bots primarily work with data, configuration, and documentation files. A better understanding if this is a limitation of the bots and, if overcome, would lead to different kinds of bots remains an open question.},
  booktitle = {Proceedings of the IEEE/ACM 42nd International Conference on Software Engineering Workshops},
  pages = {61--65},
  numpages = {5},
  keywords = {Code Commits, social coding platforms, Bots, Automated Commits},
  location = {Seoul, Republic of Korea},
  series = {ICSEW'20}
}
@inproceedings{fry2020idres,
  author = {Fry, Tanner and Dey, Tapajit and Karnauch, Andrey and Mockus, Audris},
  title = {A Dataset and an Approach for Identity Resolution of 38 Million Author {IDs} Extracted from {2B} {Git} Commits},
  year = {2020},
  isbn = {9781450375177},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3379597.3387500},
  doi = {10.1145/3379597.3387500},
  abstract = {The data collected from open source projects provide means to model large software ecosystems, but often suffer from data quality issues, specifically, multiple author identification strings in code commits might actually be associated with one developer. While many methods have been proposed for addressing this problem, they are either heuristics requiring manual tweaking, or require too much calculation time to do pairwise comparisons for 38M author IDs in, for example, the World of Code collection. In this paper, we propose a method that finds all author IDs belonging to a single developer in this entire dataset, and share the list of all author IDs that were found to have aliases. To do this, we first create blocks of potentially connected author IDs and then use a machine learning model to predict which of these potentially related IDs belong to the same developer. We processed around 38 million author IDs and found around 14.8 million IDs to have an alias, which belong to 5.4 million different developers, with the median number of aliases being 2 per developer. This dataset can be used to create more accurate models of developer behaviour at the entire OSS ecosystem level and can be used to provide a service to rapidly resolve new author IDs.},
  booktitle = {Proceedings of the 17th International Conference on Mining Software Repositories},
  pages = {518--522},
  numpages = {5},
  keywords = {Identity Resolution, Heuristics, Machine Learning, Data Sharing, Git Commits},
  location = {Seoul, Republic of Korea},
  series = {MSR '20}
}
@article{Krutauz2020codereview,
  author = {Krutauz, Andrey and Dey, Tapajit and Rigby, Peter C. and Mockus, Audris},
  title = {Do code review measures explain the incidence of post-release defects?},
  journal = {Empirical Software Engineering},
  year = {2020},
  month = sep,
  day = {01},
  volume = {25},
  number = {5},
  pages = {3323--3356},
  abstract = {In contrast to studies of defects found during code review, we aim to clarify whether code review measures can explain the prevalence of post-release defects.},
  issn = {1573-7616},
  doi = {10.1007/s10664-020-09837-4},
  url = {https://doi.org/10.1007/s10664-020-09837-4}
}
@article{Dey2020qualityEMSE,
  author = {Dey, Tapajit and Mockus, Audris},
  title = {Deriving a usage-independent software quality metric},
  journal = {Empirical Software Engineering},
  year = {2020},
  month = mar,
  day = {01},
  volume = {25},
  number = {2},
  pages = {1596--1641},
  abstract = {The extent of post-release use of software affects the number of faults, thus biasing quality metrics and adversely affecting associated decisions. The proprietary nature of usage data limited deeper exploration of this subject in the past.},
  issn = {1573-7616},
  doi = {10.1007/s10664-019-09791-w},
  url = {https://doi.org/10.1007/s10664-019-09791-w}
}
@inproceedings{dey2019patterns,
  author = {Dey, Tapajit and Ma, Yuxing and Mockus, Audris},
  title = {Patterns of Effort Contribution and Demand and User Classification Based on Participation Patterns in {NPM} Ecosystem},
  year = {2019},
  isbn = {9781450372336},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3345629.3345634},
  doi = {10.1145/3345629.3345634},
  abstract = {Background: Open source requires participation of volunteer and commercial developers (users) in order to deliver functional high-quality components. Developers both contribute effort in the form of patches and demand effort from the component maintainers to resolve issues reported against it. Open source components depend on each other directly and transitively, and evidence suggests that more effort is required for reporting and resolving the issues reported further upstream in this supply chain. Aim: Identify and characterize patterns of effort contribution and demand throughout the open source supply chain and investigate if and how these patterns vary with developer activity; identify different groups of developers; and predict developers' company affiliation based on their participation patterns. Method: 1,376,946 issues and pull-requests created for 4433 NPM packages with over 10,000 monthly downloads and full (public) commit activity data of the 272,142 issue creators is obtained and analyzed and dependencies on NPM packages are identified. Fuzzy c-means clustering algorithm is used to find the groups among the users based on their effort contribution and demand patterns, and Random Forest is used as the predictive modeling technique to identify their company affiliations. Result: Users contribute and demand effort primarily from packages that they depend on directly with only a tiny fraction of contributions and demand going to transitive dependencies. A significant portion of demand goes into packages outside the users' respective supply chains (constructed based on publicly visible version control data). Three and two different groups of users are observed based on the effort demand and effort contribution patterns respectively. The Random Forest model used for identifying the company affiliation of the users gives a AUC-ROC value of 0.68, and variables representing aggregate participation patterns proved to be the important predictors. 
Conclusion: Our results give new insights into effort demand and supply at different parts of the supply chain of the NPM ecosystem and its users and suggests the need to increase visibility further upstream.},
  booktitle = {Proceedings of the Fifteenth International Conference on Predictive Models and Data Analytics in Software Engineering},
  pages = {36--45},
  numpages = {10},
  keywords = {Random Forest model, NPM Packages, Software Issue Reporting, Clustering, Software Dependencies, User Contribution},
  location = {Recife, Brazil},
  series = {PROMISE'19}
}
@incollection{Amreen2019bookchapter,
  author = {Amreen, Sadika and Bichescu, Bogdan and Bradley, Randy and Dey, Tapajit and Ma, Yuxing and Mockus, Audris and Mousavi, Sara and Zaretzki, Russell},
  editor = {Fitzgerald, Brian and Mockus, Audris and Zhou, Minghui},
  title = {A Methodology for Measuring {FLOSS} Ecosystems},
  booktitle = {Towards Engineering Free/Libre Open Source Software (FLOSS) Ecosystems for Impact and Sustainability: Communications of NII Shonan Meetings},
  year = {2019},
  publisher = {Springer Singapore},
  address = {Singapore},
  pages = {1--29},
  abstract = {FLOSS ecosystem as a whole is a critical component of world's computing infrastructure, yet not well understood. In order to understand it well, we need to measure it first. We, therefore, aim to provide a framework for measuring key aspects of the entire FLOSS ecosystem. We first consider the FLOSS ecosystem through lens of a supply chain. The concept of supply chain is the existence of series of interconnected parties/affiliates each contributing unique elements and expertise so as to ensure a final solution is accessible to all interested parties. This perspective has been extremely successful in helping allowing companies to cope with multifaceted risks caused by the distributed decision-making in their supply chains, especially as they have become more global. Software ecosystems, similarly, represent distributed decisions in supply chains of code and author contributions, suggesting that relationships among projects, developers, and source code have to be measured. We then describe a massive measurement infrastructure involving discovery, extraction, cleaning, correction, and augmentation of publicly available open-source data from version control systems and other sources. We then illustrate how the key relationships among the nodes representing developers, projects, changes, and files can be accurately measured, how to handle absence of measures for user base in version control data, and, finally, illustrate how such measurement infrastructure can be used to increase knowledge resilience in FLOSS.},
  isbn = {978-981-13-7099-1},
  doi = {10.1007/978-981-13-7099-1_1},
  url = {https://doi.org/10.1007/978-981-13-7099-1_1}
}
@inproceedings{dey2018usageQuality,
  author = {Dey, Tapajit and Mockus, Audris},
  title = {Modeling Relationship between Post-Release Faults and Usage in Mobile Software},
  year = {2018},
  isbn = {9781450365932},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3273934.3273941},
  doi = {10.1145/3273934.3273941},
  abstract = {Background: The way post-release usage of a software affects the number of faults experienced by users is scarcely explored due to the proprietary nature of such data. The commonly used quality measure of post-release faults may, therefore, reflect usage instead of the quality of the software development process. Aim: To determine how software faults and software use are related in a post-deployment scenario and, based on that, derive post-deployment quality measure that reflects developers' performance more accurately. Method: We analyze Google Analytics data counting daily new users, visits, time-on-site, visits per user, and release start date and duration for 169 releases of a complex communication application for Android OS. We utilize Linear Regression, Bayesian Network, and Random Forest models to explain the interrelationships and to derive release quality measure that is relatively stable with respect to variations in software usage. Results: We found the number of new users and release start date to be the determining factors for the number of exceptions, and found no direct link between the intensity and frequency of software usage and software faults. Furthermore, the relative increase in the number of crashes was found to be stably associated with a power of 1.3 relative increase in the number of new users. Based on the findings we propose a release quality measure: number of crashes per user for a release of the software, which was seen to be independent of any other usage variables, providing us with a usage independent measure of software quality. Conclusions: We expect our result and our proposed quality measure will help gauge release quality of a software more accurately and inspire further research in this area.},
  booktitle = {Proceedings of the 14th International Conference on Predictive Models and Data Analytics in Software Engineering},
  pages = {56--65},
  numpages = {10},
  keywords = {Bayesian Networks, Software Quality, Random Forest, Linear Regression, Software Usage, Software Faults},
  location = {Oulu, Finland},
  series = {PROMISE'18}
}
@inproceedings{dey2018dependency,
  author = {Dey, Tapajit and Mockus, Audris},
  title = {Are Software Dependency Supply Chain Metrics Useful in Predicting Change of Popularity of {NPM} Packages?},
  year = {2018},
  isbn = {9781450365932},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3273934.3273942},
  doi = {10.1145/3273934.3273942},
  abstract = {Background: As software development becomes more interdependent, unique relationships among software packages arise and form complex software ecosystems. Aim: We aim to understand the behavior of these ecosystems better through the lens of software supply chains and model how the effects of software dependency network affect the change in downloads of Javascript packages. Method: We analyzed 12,999 popular packages in NPM, between 01-December-2017 and 15-March-2018, using Linear Regression and Random Forest models and examined the effects of predictors representing different aspects of the software dependency supply chain on changes in numbers of downloads for a package. Result: Preliminary results suggest that the count and downloads of upstream and downstream runtime dependencies have a strong effect on the change in downloads, with packages having fewer, more popular packages as dependencies (upstream or downstream) likely to see an increase in downloads. This suggests that in order to interpret the number of downloads for a package properly, it is necessary to take into account the peculiarities of the supply chain (both upstream and downstream) of that package. Conclusion: Future work is needed to identify the effects of added, deleted, and unchanged dependencies for different types of packages, e.g. build tools, test tools.},
  booktitle = {Proceedings of the 14th International Conference on Predictive Models and Data Analytics in Software Engineering},
  pages = {66--69},
  numpages = {4},
  keywords = {Software Supply Chain, Software Dependency, Open Source, Software Popularity, NPM Packages},
  location = {Oulu, Finland},
  series = {PROMISE'18}
}
@inproceedings{dey2016mods,
  author = {Dey, Tapajit and Massengill, Jacob Logan and Mockus, Audris},
  title = {Analysis of Popularity of Game Mods: A Case Study},
  year = {2016},
  isbn = {9781450344586},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/2968120.2987724},
  doi = {10.1145/2968120.2987724},
  abstract = {Video game mods have become an integral part of the gameplay experience for PC gamers and have drawn support of major game development companies. This area, nevertheless, is not well understood, especially in terms of what motivates mod creators and users. To explore this question we propose a data-driven approach that relies on data that can be obtained from online fora devoted for game mods. To illustrate our approach, we have collected data on deployment and popularity for the mods in six major PC games made by two companies and spanning more than a decade from different popular mod hosting websites. In particular, we investigate what features are present in the mods that are being developed and what features are popular among mod users. To accomplish that, we propose measures of popularity of the mods based on the number of unique downloads and define mod feature space by the tags associated with each mod. The preliminary investigation suggests, for example, that the features developed across the distinct games to be similar but the sets of features popular among users were only marginally similar to the sets of features being implemented by mod creators. We plan surveys of mod users and mod creators to determine causes for this discrepancy. We hope that our approach would allow answering important research and practical questions in the area of game mod development.},
  booktitle = {Proceedings of the 2016 Annual Symposium on Computer-Human Interaction in Play Companion Extended Abstracts},
  pages = {133--139},
  numpages = {7},
  keywords = {tags, mod features, popularity, game mods},
  location = {Austin, Texas, USA},
  series = {CHI PLAY Companion '16}
}
@inproceedings{ma2016,
  author = {Ma, Yuxing and Dey, Tapajit and Mockus, Audris},
  title = {Modularizing Global Variable in Climate Simulation Software: Position Paper},
  year = {2016},
  isbn = {9781450341677},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/2897676.2897681},
  doi = {10.1145/2897676.2897681},
  abstract = {In large-scale simulation codes, such as climate models, variables represent a large number of characteristics of earth surface and atmosphere for a single multi-dimensional cell and are distributed over a multitude of cores in the supercomputers where these simulations run. The hundreds of variables allow different parts of simulation representing specific sub-models, for example the photosynthesis, to interact with other sub-models of the simulation. The scientists of each domain write the simulation code for the sub-model representing their sub-domain. To integrate their code into the entire simulation, they need to deal with hundreds of unfamiliar variables of which only a small subset is relevant to their work. Designing such variables in a modular fashion, so that the scientists could interact only with the variables relevant to their sub-model is likely to increase the productivity of the scientists and to increase accuracy of the simulation codes. A natural way to group the variables into modules is by using a language feature that group them together, such as, struct construct in C language or a class in C++ language. Each scientist would then need to familiarize themselves with only a small subset of modules that contain variables used in their simulations. For example, Community Earth System Model (CESM) code v1.06 has 51 such modules (structures) that contain 1479 variables. The methods proposed below can be used to assess the modularity of the existing set of structures and to generate alternative modularizations that improve upon it. In a nutshell, the approach minimizes the number of variables exposed to other domains by the modules used in each domain.},
  booktitle = {Proceedings of the International Workshop on Software Engineering for Science},
  pages = {8--11},
  numpages = {4},
  keywords = {data modularization, developer productivity, climate simulation},
  location = {Austin, Texas},
  series = {SE4Science '16}
}

This file was generated by bibtex2html 1.99.