#!/usr/bin/env perl

# SPDX-License-Identifier: MIT
#
# Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.


# This program will update the AUTHORS file to include commit authors that are
# in the git history but are not yet credited.
#
# The CONTRIBUTORS section of the AUTHORS file attempts to be a list of
# individual contributors to OpenZFS, with one name, address and line per
# person.
# This is good for readability, but does not really leave room for the fact
# that names and emails on commits from the same individual can be different,
# for all kinds of reasons, not limited to:
#
# - a person might change organisations, and so their email address changes
#
# - a person might be paid to work on OpenZFS for their employer, and then hack
#   on personal projects in the evening, so commits legitimately come from
#   different addresses
#
# - names change for all kinds of reasons
#
# To try and account for this, this program will try to find all the possible
# names and emails for a single contributor, and then select the "best" one to
# add to the AUTHORS file.
#
# The CONTRIBUTORS section of the AUTHORS file is considered the source of
# truth. Once an individual committer is listed in there, that line will not be
# removed regardless of what is discovered in the commit history. However, it
# can't just be _anything_. The name or email still has to match something seen
# in the commit history, so that we're able to understand that it's the same
# contributor.
#
# The bulk of the work is in running `git log` to fetch commit author names and
# emails. For each value, we generate a "slug" to use as an internal id for
# that value, which is mostly just the lowercase of the value with whitespace
# and punctuation removed. Two values with subtle differences can produce the
# same slug, so at this point we also try to keep the "best" pre-slug value as
# the display version. We use this slug to update two maps, one of email->name,
# the other of name->email.
#
# Once collected, we then walk all the emails we've seen and get all the names
# associated with every instance. Then for each of those names, we get all the
# emails associated, and so on until we've seen all the connected names and
# emails. This collection is every possible name and email for an individual
# contributor.
#
# Finally, we consider these groups, and select the "best" name and email for
# the contributor, and add them to the author tables if they aren't there
# already. Once we've done everyone, we write out a new AUTHORS file, and
# that's the whole job.
#
# This is imperfect! It's necessary for the user to examine the diff and make
# sure it's sensible. If it hasn't hooked up right, it may be necessary to
# adjust the input data (via .mailmap) or improve the heuristics in this
# program. It took a long time to get into good shape when first written (355
# new names added to AUTHORS!) but hopefully in the future we'll be running
# this regularly so it doesn't fall so far behind.


use 5.010;
use warnings;
use strict;

# Storage for the "best looking" version of name or email, keyed on slug.
my %display_name;
my %display_email;

# First, we load the existing AUTHORS file. We save everything up to and
# including the CONTRIBUTORS: line as-is so we can write it back out to the
# new file. Then we extract name,email pairs from the remainder and store them
# in a pair of hashtables, keyed on slug.
my %authors_name;   # email slug -> name slug
my %authors_email;  # name slug -> email slug

my @authors_header;

{
    # Fail loudly if AUTHORS can't be read. Carrying on regardless would end
    # with us writing out a new AUTHORS that has no header and none of the
    # existing contributor lines.
    open my $authors_fh, '<', 'AUTHORS'
        or die "E: couldn't open AUTHORS for read: $!\n";

    my $in_header = 1;
    while (my $line = <$authors_fh>) {
        chomp $line;

        if ($in_header) {
            # Header lines are kept verbatim. The CONTRIBUTORS: line is the
            # last line of the header.
            push @authors_header, $line;
            $in_header = 0 if $line =~ m/^CONTRIBUTORS:/;
            next;
        }

        # Contributor lines are indented and look like "Name <email>". Lines
        # that don't match (blank lines, for instance) are skipped.
        my ($name, $email) = $line =~ m/^\s+(.+)(?= <) <([^>]+)/;
        next unless $name;

        my $semail = email_slug($email);
        my $sname = name_slug($name);

        $authors_name{$semail} = $sname;
        $authors_email{$sname} = $semail;

        # The name/email in AUTHORS is already the "best looking"
        # version, by definition.
        $display_name{$sname} = $name;
        $display_email{$semail} = $email;
    }

    close $authors_fh;
}

# Next, we load all the commit authors, and form name<->email mappings, keyed
# on slug.
# Note that this format is getting the .mailmap-converted form. This
# lets us control the input to some extent by making changes there.
my %git_names;   # email slug -> { name slug => 1 }
my %git_emails;  # name slug -> { email slug => 1 }

# Capture the log before looping so that git's exit status can be checked. If
# git failed we would otherwise see "no commits" and carry on to rewrite
# AUTHORS from incomplete data.
my @git_log = qx(git log --pretty=tformat:'%aN:::%aE');
die "E: git log failed (status $?)\n" if $? != 0;

# The log is walked in reverse of git's output order (git log prints the
# newest commits first by default, so this goes oldest to newest).
for my $line (reverse @git_log) {
    chomp $line;
    my ($name, $email) = $line =~ m/^(.*):::(.*)/;
    next unless $name && $email;

    my $semail = email_slug($email);
    my $sname = name_slug($name);

    $git_names{$semail}{$sname} = 1;
    $git_emails{$sname}{$semail} = 1;

    # Update the "best looking" display value, but only if we don't already
    # have something from the AUTHORS file. If we do, we must not change it.
    update_display_email($email) if !$authors_name{$semail};
    update_display_name($name) if !$authors_email{$sname};
}

# Now collect unique committers by all names+emails we've ever seen for them.
# We start with emails and resolve all possible names, then we resolve the
# emails for those names, and round and round until there's nothing left.
my @committers;
for my $start_email (sort keys %git_names) {
    # it might have been deleted already through a cross-reference
    next unless $git_names{$start_email};

    # Slugs already visited for this committer.
    my %emails;
    my %names;

    # Work queues of slugs still to be followed. Entries are deleted from
    # %git_names/%git_emails as they are consumed, so each slug ends up in
    # exactly one committer.
    my @check_emails = ($start_email);
    my @check_names;
    while (@check_emails || @check_names) {
        while (my $email = shift @check_emails) {
            next if $emails{$email}++;
            push @check_names,
                sort keys %{delete $git_names{$email}};
        }
        while (my $name = shift @check_names) {
            next if $names{$name}++;
            push @check_emails,
                sort keys %{delete $git_emails{$name}};
        }
    }

    # A "committer" is the collection of connected names and emails.
    push @committers, [[sort keys %emails], [sort keys %names]];
}

# Now we have our committers, we can work out what to add to AUTHORS.
for my $committer (@committers) {
    my ($emails, $names) = @$committer;

    # If this committer is already in AUTHORS, we must not touch.
    next if grep { $authors_name{$_} } @$emails;
    next if grep { $authors_email{$_} } @$names;

    # Decide on the "best" name and email to use
    my $email = best_email(@$emails);
    my $name = best_name(@$names);

    $authors_email{$name} = $email;
    $authors_name{$email} = $name;
}

# Now output the new AUTHORS file
open my $fh, '>', 'AUTHORS' or die "E: couldn't open AUTHORS for write: $!\n";
#my $fh = \*STDOUT;

# The saved header lines, followed by a blank line.
say $fh join("\n", @authors_header, "");

# One contributor per line, ordered by name slug.
for my $name (sort keys %authors_email) {
    my $cname = $display_name{$name};
    my $cemail = $display_email{email_slug($authors_email{$name})};
    # NOTE(review): confirm the leading indent here matches the width used by
    # the existing entries in AUTHORS.
    say $fh " $cname <$cemail>";
}

# Buffered write errors (a full disk, say) only show up at close, so check it.
close $fh or die "E: couldn't finish writing AUTHORS: $!\n";

exit 0;

# "Slugs" are used as the hashtable key for names and emails. They are used to
# make two variants of a value be the "same" for matching. Mostly this is
# to make upper and lower-case versions of a name or email compare the same,
# but we do a little bit of munging to handle some common cases.
#
# Note that these are only used for matching internally; for display, the
# slug will be used to look up the display form.

# name_slug($name): returns the slug for a name, which is the name with all
# whitespace and dots removed, lowercased.
sub name_slug {
    my ($name) = @_;

    # Remove spaces and dots, to handle differences in initials.
    $name =~ s/[\s\.]//g;

    return lc $name;
}

# email_slug($email): returns the slug for an email address, which is the
# address with whitespace-separated junk and any Github noreply "userid+"
# prefix removed, lowercased.
sub email_slug {
    my ($email) = @_;

    # Strip anything separated from the address by whitespace.
    #
    # NOTE(review): the leading match is greedy, so for a single-line value
    # this removes everything up to and including the *last* run of
    # whitespace, leaving only what follows it. Confirm that is the intent.
    $email =~ s/^(.*\s+)|(\s+.*)$//g;

    # Remove the leading userid+ on Github noreply addresses. They're
    # optional and we want to treat them as the same thing.
    $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/;

    return lc $email;
}

# update_display_name($name): records $name as the display form for its slug
# if there is no display form yet, or if $name is "more specific" than the
# one currently recorded.
sub update_display_name {
    my ($name) = @_;
    my $sname = name_slug($name);

    # For names, "more specific" is measured by counting the ASCII lower-case
    # letters and spaces in each candidate; the candidate with fewer of them
    # wins. The guess is that if a person has gone to some effort to
    # specialise their name in a later commit, they presumably care more
    # about it. If this is wrong, it's probably better to add a .mailmap
    # entry.

    my $cname = $display_name{$sname};
    if (!$cname ||
        ($name =~ tr/a-z //) < ($cname =~ tr/a-z //)) {
        $display_name{$sname} = $name;
    }
}

# update_display_email($email): records $email as the display form for its
# slug if there is no display form yet, or if $email is "more specific" than
# the one currently recorded.
sub update_display_email {
    my ($email) = @_;
    my $semail = email_slug($email);

    # Like names, we prefer the candidate with fewer lower-case letters and
    # spaces. We also remove any leading "plus address" for Github noreply
    # addresses before storing the display form.
    $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/;

    my $cemail = $display_email{$semail};
    if (!$cemail ||
        ($email =~ tr/a-z //) < ($cemail =~ tr/a-z //)) {
        $display_email{$semail} = $email;
    }
}

# best_name(@name_slugs): returns the slug whose display form sorts first.
sub best_name {
    # The "best" name is very subjective, and a simple sort
    # produced good-enough results, so I didn't try harder. Use of
    # accented characters, punctuation and caps are probably an
    # indicator of "better", but possibly we should also take into
    # account the most recent name we saw, in case the committer
    # has changed their name or nickname or similar.
    #
    # Really, .mailmap is the place to control this.
    my @names = sort { $display_name{$a} cmp $display_name{$b} } @_;

    return shift @names;
}

# best_email(@email_slugs): returns the most preferred slug from the list,
# using the ordered rules in the sort block below.
sub best_email {
    state $internal_re = qr/\.(?:internal|local|\(none\))$/;
    state $noreply_re = qr/\.noreply\.github\.com$/;
    state $freemail_re = qr/\@(?:gmail|hotmail)\.com$/;

    my @emails = sort {
        my $cmp;

        # prefer address with a single @ over those without
        $cmp = (($b =~ tr/@//) == 1) <=> (($a =~ tr/@//) == 1);
        return $cmp unless $cmp == 0;

        # prefer any address over internal/local addresses
        $cmp = (($a =~ $internal_re) <=> ($b =~ $internal_re));
        return $cmp unless $cmp == 0;

        # prefer any address over github noreply aliases
        $cmp = (($a =~ $noreply_re) <=> ($b =~ $noreply_re));
        return $cmp unless $cmp == 0;

        # prefer any address over freemail providers
        $cmp = (($a =~ $freemail_re) <=> ($b =~ $freemail_re));
        return $cmp unless $cmp == 0;

        # Split into local part and domain. An address with no @ has no
        # domain; treat the missing piece as empty so the comparisons below
        # don't raise "uninitialized value" warnings. The resulting order is
        # the same as comparing against undef.
        my ($alocal, $adom) = split /\@/, $a;
        my ($blocal, $bdom) = split /\@/, $b;
        $_ //= '' for ($alocal, $adom, $blocal, $bdom);

        # alphabetical by domain
        $cmp = ($adom cmp $bdom);
        return $cmp unless $cmp == 0;

        # alphabetical by local part
        return ($alocal cmp $blocal);
    } @_;

    return shift @emails;
}