#!/usr/bin/env perl

# SPDX-License-Identifier: MIT
#
# Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.


# This program will update the AUTHORS file to include commit authors that are
# in the git history but are not yet credited.
#
# The CONTRIBUTORS section of the AUTHORS file attempts to be a list of
# individual contributors to OpenZFS, with one name, address and line per
# person. This is good for readability, but does not really leave room for the
# fact that names and emails on commits from the same individual can be
# different, for all kinds of reasons, not limited to:
#
# - a person might change organisations, and so their email address changes
#
# - a person might be paid to work on OpenZFS for their employer, and then hack
#   on personal projects in the evening, so commits legitimately come from
#   different addresses
#
# - names change for all kinds of reasons
#
# To try and account for this, this program will try to find all the possible
# names and emails for a single contributor, and then select the "best" one to
# add to the AUTHORS file.
#
# The CONTRIBUTORS section of the AUTHORS file is considered the source of
# truth. Once an individual committer is listed in there, that line will not be
# removed regardless of what is discovered in the commit history. However, it
# can't just be _anything_. The name or email still has to match something seen
# in the commit history, so that we're able to understand that it's the same
# contributor.
#
# The bulk of the work is in running `git log` to fetch commit author names and
# emails. For each value, we generate a "slug" to use as an internal id for
# that value, which is mostly just the lowercase of the value with whitespace
# and punctuation removed. Two values with subtle differences can produce the
# same slug, so at this point we also try to keep the "best" pre-slug value as
# the display version. We use this slug to update two maps, one of email->name,
# the other of name->email.
#
# Where possible, we also consider Signed-off-by: trailers in the commit
# message, and if they match the commit author, enter them into the maps also.
# Because a commit can contain multiple signoffs, we only track one if either
# the name or the email address match the commit author (by slug). This is
# mostly aimed at letting an explicit signoff override a generated name or
# email on the same commit (usually a Github noreply), while avoiding every
# signoff ever being treated as a possible canonical ident for some other
# committer. (Also note that this behaviour only works for signoffs that can be
# extracted with git-interpret-trailers, which misses many seen in the OpenZFS
# git history, for various reasons).
#
# Once collected, we then walk all the emails we've seen and get all the names
# associated with every instance. Then for each of those names, we get all the
# emails associated, and so on until we've seen all the connected names and
# emails. This collection is every possible name and email for an individual
# contributor.
#
# Finally, we consider these groups, and select the "best" name and email for
# the contributor, and add them to the author tables if they aren't there
# already. Once we've done everyone, we write out a new AUTHORS file, and
# that's the whole job.
#
# This is imperfect! It's necessary for the user to examine the diff and make
# sure it's sensible. If it hasn't hooked up right, it may be necessary to
# adjust the input data (via .mailmap) or improve the heuristics in this
# program. It took a long time to get into good shape when first written (355
# new names added to AUTHORS!) but hopefully in the future we'll be running
# this regularly so it doesn't fall so far behind.


use 5.010;
use warnings;
use strict;

# Storage for the "best looking" version of name or email, keyed on slug.
my %display_name;
my %display_email;

# First, we load the existing AUTHORS file. We save everything up to and
# including the CONTRIBUTORS: line as-is so we can write it back out to the
# new file. Then we extract name,email pairs from the remainder and store them
# in a pair of hashtables, keyed on slug.
#
# Note that both tables are keyed on a single slug, so if two AUTHORS lines
# produce the same name slug (or the same email slug), the later line
# replaces the earlier one in the table.
my %authors_name;     # email slug -> name slug
my %authors_email;    # name slug -> email slug

my @authors_header;

# Leading whitespace of the first existing contributor line. It is reused when
# the file is written back so that regenerating AUTHORS never produces
# whitespace-only changes to lines that are already there.
my $authors_indent;

{
    # Fail loudly if AUTHORS can't be read, the same way we do when it can't
    # be written; carrying on would regenerate the file from nothing.
    open my $in, '<', 'AUTHORS'
        or die "E: couldn't open AUTHORS for read: $!\n";

    my $in_header = 1;
    while (my $line = <$in>) {
        chomp $line;

        if ($in_header) {
            push @authors_header, $line;
            $in_header = 0 if $line =~ m/^CONTRIBUTORS:/;
            next;
        }

        # Contributor lines look like "<indent>Name <email>". Anything else
        # after the header (blank lines and so on) is skipped.
        my ($indent, $name, $email) =
            $line =~ m/^(\s+)(.+)(?= <) <([^>]+)/;
        next unless $name;

        $authors_indent //= $indent;

        my $semail = email_slug($email);
        my $sname = name_slug($name);

        $authors_name{$semail} = $sname;
        $authors_email{$sname} = $semail;

        # The name/email in AUTHORS is already the "best looking"
        # version, by definition.
        $display_name{$sname} = $name;
        $display_email{$semail} = $email;
    }

    close $in;
}

# No contributor lines to copy the indentation from; use the default.
$authors_indent //= ' ';

# Next, we load all the commit authors and signoff pairs, and form name<->email
# mappings, keyed on slug. Note that this format is getting the
# .mailmap-converted form. This lets us control the input to some extent by
# making changes there.
my %seen_names;     # email slug -> { name slug => 1 }
my %seen_emails;    # name slug -> { email slug => 1 }

# The true email address from commits, by slug. We do this so we can generate
# mailmap entries, which will only match the exact address from the commit,
# not anything "prettified". This lets us remember the prefix part of Github
# noreply addresses, while not including it in AUTHORS if that is truly the
# best option we have.
my %commit_email;

my @log_lines = qx(git log --pretty=tformat:'%aN:::%aE:::%(trailers:key=signed-off-by,valueonly,separator=:::)');

# If git could not be run, or exited with an error, whatever output we got is
# not a complete history. Stop here rather than rewrite AUTHORS from it.
if ($? != 0) {
    my $why = $? == -1 ? "$!" : "wait status $?";
    die "E: couldn't run git log: $why\n";
}

# git log lists newest commits first; reverse so we walk oldest to newest.
for my $line (reverse @log_lines) {
    chomp $line;
    my ($name, $email, @signoffs) = split ':::', $line;
    next unless $name && $email;

    my $semail = email_slug($email);
    my $sname = name_slug($name);

    # Track the committer name and email.
    $seen_names{$semail}{$sname} = 1;
    $seen_emails{$sname}{$semail} = 1;

    # Keep the original commit address.
    $commit_email{$semail} = $email;

    # Consider if these are the best we've ever seen.
    update_display_name($name);
    update_display_email($email);

    # Check signoffs. Any that have a matching name or email as the
    # committer (by slug), also track them.
    for my $signoff (@signoffs) {
        my ($soname, $soemail) = $signoff =~ m/^([^<]+)\s+<(.+)>$/;
        next unless $soname && $soemail;

        my $ssoname = name_slug($soname);
        my $ssoemail = email_slug($soemail);

        # Exclusive-or: a signoff is only recorded when exactly one of its
        # name and email matches the commit author.
        if (($semail eq $ssoemail) ^ ($sname eq $ssoname)) {
            $seen_names{$ssoemail}{$ssoname} = 1;
            $seen_emails{$ssoname}{$ssoemail} = 1;
            update_display_name($soname);
            update_display_email($soemail);
        }
    }
}

# Now collect unique committers by all names+emails we've ever seen for them.
# We start with emails and resolve all possible names, then we resolve the
# emails for those names, and round and round until there's nothing left.
my @committers;
for my $start_email (sort keys %seen_names) {
    # it might have been deleted already through a cross-reference
    next unless exists $seen_names{$start_email};

    my %emails;
    my %names;

    my @check_emails = ($start_email);
    my @check_names;
    while (@check_emails || @check_names) {
        # Test the shifted value with defined() so that a slug that happens
        # to be "" or "0" is still processed instead of ending the loop.
        while (defined(my $email = shift @check_emails)) {
            next if $emails{$email}++;
            my $linked = delete $seen_names{$email};
            push @check_names, sort keys %$linked if $linked;
        }
        while (defined(my $name = shift @check_names)) {
            next if $names{$name}++;
            my $linked = delete $seen_emails{$name};
            push @check_emails, sort keys %$linked if $linked;
        }
    }

    # A "committer" is the collection of connected names and emails.
    push @committers, [[sort keys %emails], [sort keys %names]];
}

# Now we have our committers, we can work out what to add to AUTHORS.
for my $committer (@committers) {
    my ($emails, $names) = @$committer;

    # If this committer is already in AUTHORS, we must not touch.
    next if grep { exists $authors_name{$_} } @$emails;
    next if grep { exists $authors_email{$_} } @$names;

    # Decide on the "best" name and email to use
    my $email = best_email(@$emails);
    my $name = best_name(@$names);

    $authors_email{$name} = $email;
    $authors_name{$email} = $name;

    # We've now selected our canonical name going forward. If there
    # were other options from commit authors only (not signoffs),
    # emit mailmap lines for the user to paste into .mailmap.
    #
    # $name and $email are already slugs, so they index the display tables
    # directly.
    my $cname = $display_name{$name};
    my $cemail = $display_email{$email};
    for my $alias (@$emails) {
        next if $alias eq $email;

        # Only addresses that appeared as a commit author are recorded in
        # %commit_email; signoff-only addresses are skipped here.
        my $calias = $commit_email{$alias};
        next unless $calias;

        say "$cname <$cemail> <$calias>";
    }
}

# Now output the new AUTHORS file
open my $fh, '>', 'AUTHORS' or die "E: couldn't open AUTHORS for write: $!\n";
say $fh join("\n", @authors_header, "");
for my $name (sort keys %authors_email) {
    my $cname = $display_name{$name};
    my $cemail = $display_email{$authors_email{$name}};
    say $fh "$authors_indent$cname <$cemail>";
}

# Buffered write errors only show up at close, so check it.
close $fh or die "E: couldn't write AUTHORS: $!\n";

exit 0;

# "Slugs" are used as the hashtable key for names and emails. They are used to
# make two variants of a value be the "same" for matching. Mostly this is
# to make upper and lower-case versions of a name or email compare the same,
# but we do a little bit of munging to handle some common cases.
#
# Note that these are only used for matching internally; for display, the
# slug will be used to look up the display form.

# name_slug($name): returns $name lowercased, with all whitespace and dots
# removed.
sub name_slug {
    my ($name) = @_;

    # Remove spaces and dots, to handle differences in initials.
    $name =~ s/[\s\.]//g;

    return lc $name;
}

# email_slug($email): returns the lowercased matching key for an address.
sub email_slug {
    my ($email) = @_;

    # Strip surrounding text separated from the address by whitespace. The
    # first alternative is greedy, so for a single-line value this removes
    # everything up to and including the last run of whitespace, leaving
    # only the final whitespace-free token.
    $email =~ s/^(.*\s+)|(\s+.*)$//g;

    # Remove the leading userid+ on Github noreply addresses. They're
    # optional and we want to treat them as the same thing.
    $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/;

    return lc $email;
}

# As we accumulate new names and addresses, record the "best looking" version
# of each. Once we decide to add a committer to AUTHORS, we'll take the best
# version of their name and address from here.
#
# Note that we don't record them if they're already in AUTHORS (that is, in
# %authors_name or %authors_email) because that file already contains the
# "best" version, by definition. So we return immediately if we've seen it
# there already.

# update_display_name($name): offer $name as the display form for its slug in
# %display_name. Returns nothing useful.
sub update_display_name {
    my ($name) = @_;
    my $sname = name_slug($name);
    return if exists $authors_email{$sname};

    # For names, "more specific" means "has more non-lower-case characters"
    # (in ASCII), guessing that if a person has gone to some effort to
    # specialise their name in a later commit, they presumably care more
    # about it. If this is wrong, it's probably better to add a .mailmap
    # entry.
    #
    # tr/// with an empty replacement only counts; the candidate with fewer
    # lower-case letters and spaces wins, and ties keep the current value.
    my $cname = $display_name{$sname};
    if (!$cname ||
        ($name =~ tr/a-z //) < ($cname =~ tr/a-z //)) {
        $display_name{$sname} = $name;
    }

    return;
}

# update_display_email($email): offer $email as the display form for its slug
# in %display_email. Returns nothing useful.
sub update_display_email {
    my ($email) = @_;
    my $semail = email_slug($email);
    return if exists $authors_name{$semail};

    # Like names, we prefer uppercase when possible. We also remove any
    # leading "plus address" for Github noreply addresses.
    $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/;

    my $cemail = $display_email{$semail};
    if (!$cemail ||
        ($email =~ tr/a-z //) < ($cemail =~ tr/a-z //)) {
        $display_email{$semail} = $email;
    }

    return;
}

# best_name(@name_slugs): returns the slug whose display name sorts first in
# a plain string comparison, or undef when called with an empty list.
sub best_name {
    my @names = sort {
        # The "best" name is very subjective, and a simple sort
        # produced good-enough results, so I didn't try harder. Use of
        # accented characters, punctuation and caps are probably an
        # indicator of "better", but possibly we should also take into
        # account the most recent name we saw, in case the committer
        # has changed their name or nickname or similar.
        #
        # Really, .mailmap is the place to control this.
        $display_name{$a} cmp $display_name{$b}
    } @_;

    return shift @names;
}

# best_email(@email_slugs): returns the most preferred slug according to the
# ordered rules below, or undef when called with an empty list.
sub best_email {
    state $internal_re = qr/\.(?:internal|local|\(none\))$/;
    state $noreply_re = qr/\.noreply\.github\.com$/;
    state $freemail_re = qr/\@(?:gmail|hotmail)\.com$/;

    my @emails = sort {
        my $cmp;

        # prefer addresses with exactly one @ over any others
        $cmp = (($b =~ tr/@//) == 1) <=> (($a =~ tr/@//) == 1);
        return $cmp unless $cmp == 0;

        # prefer any address over internal/local addresses
        $cmp = (($a =~ $internal_re) <=> ($b =~ $internal_re));
        return $cmp unless $cmp == 0;

        # prefer any address over github noreply aliases
        $cmp = (($a =~ $noreply_re) <=> ($b =~ $noreply_re));
        return $cmp unless $cmp == 0;

        # prefer any address over freemail providers
        $cmp = (($a =~ $freemail_re) <=> ($b =~ $freemail_re));
        return $cmp unless $cmp == 0;

        # An address with no @ has no domain part; compare it as an empty
        # string rather than as undef, which would warn.
        my ($alocal, $adom) = split /\@/, $a;
        my ($blocal, $bdom) = split /\@/, $b;

        # alphabetical by domain
        $cmp = (($adom // '') cmp ($bdom // ''));
        return $cmp unless $cmp == 0;

        # alphabetical by local part
        return (($alocal // '') cmp ($blocal // ''));
    } @_;

    return shift @emails;
}