xref: /freebsd/sys/contrib/openzfs/scripts/update_authors.pl (revision b1c1ee4429fcca8f69873a8be66184e68e1b19d7)
123cf27dbSMartin Matuska#!/usr/bin/env perl
223cf27dbSMartin Matuska
323cf27dbSMartin Matuska# SPDX-License-Identifier: MIT
423cf27dbSMartin Matuska#
523cf27dbSMartin Matuska# Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
623cf27dbSMartin Matuska#
723cf27dbSMartin Matuska# Permission is hereby granted, free of charge, to any person obtaining a copy
823cf27dbSMartin Matuska# of this software and associated documentation files (the "Software"), to
923cf27dbSMartin Matuska# deal in the Software without restriction, including without limitation the
1023cf27dbSMartin Matuska# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
1123cf27dbSMartin Matuska# sell copies of the Software, and to permit persons to whom the Software is
1223cf27dbSMartin Matuska# furnished to do so, subject to the following conditions:
1323cf27dbSMartin Matuska#
1423cf27dbSMartin Matuska# The above copyright notice and this permission notice shall be included in
1523cf27dbSMartin Matuska# all copies or substantial portions of the Software.
1623cf27dbSMartin Matuska#
1723cf27dbSMartin Matuska# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1823cf27dbSMartin Matuska# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1923cf27dbSMartin Matuska# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2023cf27dbSMartin Matuska# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2123cf27dbSMartin Matuska# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
2223cf27dbSMartin Matuska# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
2323cf27dbSMartin Matuska# IN THE SOFTWARE.
2423cf27dbSMartin Matuska
2523cf27dbSMartin Matuska
2623cf27dbSMartin Matuska# This program will update the AUTHORS file to include commit authors that are
2723cf27dbSMartin Matuska# in the git history but are not yet credited.
2823cf27dbSMartin Matuska#
2923cf27dbSMartin Matuska# The CONTRIBUTORS section of the AUTHORS file attempts to be a list of
3023cf27dbSMartin Matuska# individual contributors to OpenZFS, with one name, address and line per
3123cf27dbSMartin Matuska# person. This is good for readability, but does not really leave room for the
3223cf27dbSMartin Matuska# that names and emails on commits from the same individual can be different,
3323cf27dbSMartin Matuska# for all kinds of reasons, not limited to:
3423cf27dbSMartin Matuska#
3523cf27dbSMartin Matuska# - a person might change organisations, and so their email address changes
3623cf27dbSMartin Matuska#
3723cf27dbSMartin Matuska# - a person might be paid to work on OpenZFS for their employer, and then hack
3823cf27dbSMartin Matuska#   on personal projects in the evening, so commits legitimately come from
3923cf27dbSMartin Matuska#   different addresses
4023cf27dbSMartin Matuska#
4123cf27dbSMartin Matuska# - names change for all kinds of reasons
4223cf27dbSMartin Matuska#
4323cf27dbSMartin Matuska# To try and account for this, this program will try to find all the possible
4423cf27dbSMartin Matuska# names and emails for a single contributor, and then select the "best" one to
4523cf27dbSMartin Matuska# add to the AUTHORS file.
4623cf27dbSMartin Matuska#
4723cf27dbSMartin Matuska# The CONTRIBUTORS section of the AUTHORS file is considered the source of
4823cf27dbSMartin Matuska# truth. Once an individual committer is listed in there, that line will not be
4923cf27dbSMartin Matuska# removed regardless of what is discovered in the commit history. However, it
5023cf27dbSMartin Matuska# can't just be _anything_. The name or email still has to match something seen
5123cf27dbSMartin Matuska# in the commit history, so that we're able to undertand that its the same
5223cf27dbSMartin Matuska# contributor.
5323cf27dbSMartin Matuska#
5423cf27dbSMartin Matuska# The bulk of the work is in running `git log` to fetch commit author names and
5523cf27dbSMartin Matuska# emails. For each value, we generate a "slug" to use as an internal id for
5623cf27dbSMartin Matuska# that value, which is mostly just the lowercase of the value with whitespace
5723cf27dbSMartin Matuska# and punctuation removed. Two values with subtle differences can produce the
5823cf27dbSMartin Matuska# same slug, so at this point we also try to keep the "best" pre-slug value as
5923cf27dbSMartin Matuska# the display version. We use this slug to update two maps, one of email->name,
6023cf27dbSMartin Matuska# the other of name->email.
6123cf27dbSMartin Matuska#
62*b1c1ee44SMartin Matuska# Where possible, we also consider Signed-off-by: trailers in the commit
63*b1c1ee44SMartin Matuska# message, and if they match the commit author, enter them into the maps also.
64*b1c1ee44SMartin Matuska# Because a commit can contain multiple signoffs, we only track one if either
65*b1c1ee44SMartin Matuska# the name or the email address match the commit author (by slug). This is
66*b1c1ee44SMartin Matuska# mostly aimed at letting an explicit signoff override a generated name or
67*b1c1ee44SMartin Matuska# email on the same commit (usually a Github noreply), while avoiding every
68*b1c1ee44SMartin Matuska# signoff ever being treated as a possible canonical ident for some other
69*b1c1ee44SMartin Matuska# committer. (Also note that this behaviour only works for signoffs that can be
70*b1c1ee44SMartin Matuska# extracted with git-interpret-trailers, which misses many seen in the OpenZFS
71*b1c1ee44SMartin Matuska# git history, for various reasons).
72*b1c1ee44SMartin Matuska#
7323cf27dbSMartin Matuska# Once collected, we then walk all the emails we've seen and get all the names
7423cf27dbSMartin Matuska# associated with every instance. Then for each of those names, we get all the
7523cf27dbSMartin Matuska# emails associated, and so on until we've seen all the connected names and
7623cf27dbSMartin Matuska# emails. This collection is every possible name and email for an individual
7723cf27dbSMartin Matuska# contributor.
7823cf27dbSMartin Matuska#
7923cf27dbSMartin Matuska# Finaly, we consider these groups, and select the "best" name and email for
8023cf27dbSMartin Matuska# the contributor, and add them to the author tables if they aren't there
8123cf27dbSMartin Matuska# already. Once we've done everyone, we write out a new AUTHORS file, and
8223cf27dbSMartin Matuska# that's the whole job.
8323cf27dbSMartin Matuska#
8423cf27dbSMartin Matuska# This is imperfect! Its necessary for the user to examine the diff and make
8523cf27dbSMartin Matuska# sure its sensible. If it hasn't hooked up right, it may necessary to adjust
8623cf27dbSMartin Matuska# the input data (via .mailmap) or improve the heuristics in this program. It
8723cf27dbSMartin Matuska# took a long time to get into good shape when first written (355 new names
8823cf27dbSMartin Matuska# added to AUTHORS!) but hopefully in the future we'll be running this
8923cf27dbSMartin Matuska# regularly so it doesn't fall so far behind.
9023cf27dbSMartin Matuska
9123cf27dbSMartin Matuska
9223cf27dbSMartin Matuskause 5.010;
9323cf27dbSMartin Matuskause warnings;
9423cf27dbSMartin Matuskause strict;
9523cf27dbSMartin Matuska
9623cf27dbSMartin Matuska# Storage for the "best looking" version of name or email, keyed on slug.
9723cf27dbSMartin Matuskamy %display_name;
9823cf27dbSMartin Matuskamy %display_email;
9923cf27dbSMartin Matuska
10023cf27dbSMartin Matuska# First, we load the existing AUTHORS file. We save everything before
10123cf27dbSMartin Matuska# CONTRIBUTORS: line as-is so we can write it back out to the new file. Then
10223cf27dbSMartin Matuska# we extract name,email pairs from the remainder and store them in a pair of
10323cf27dbSMartin Matuska# hashtables, keyed on slug.
10423cf27dbSMartin Matuskamy %authors_name;
10523cf27dbSMartin Matuskamy %authors_email;
10623cf27dbSMartin Matuska
10723cf27dbSMartin Matuskamy @authors_header;
10823cf27dbSMartin Matuska
10923cf27dbSMartin Matuskafor my $line (do { local (@ARGV) = ('AUTHORS'); <> }) {
11023cf27dbSMartin Matuska	chomp $line;
11123cf27dbSMartin Matuska	state $in_header = 1;
11223cf27dbSMartin Matuska	if ($in_header) {
11323cf27dbSMartin Matuska		push @authors_header, $line;
11423cf27dbSMartin Matuska		$in_header = 0 if $line =~ m/^CONTRIBUTORS:/;
11523cf27dbSMartin Matuska	} else {
11623cf27dbSMartin Matuska		my ($name, $email) = $line =~ m/^\s+(.+)(?= <) <([^>]+)/;
11723cf27dbSMartin Matuska		next unless $name;
11823cf27dbSMartin Matuska
11923cf27dbSMartin Matuska		my $semail = email_slug($email);
12023cf27dbSMartin Matuska		my $sname = name_slug($name);
12123cf27dbSMartin Matuska
12223cf27dbSMartin Matuska		$authors_name{$semail} = $sname;
12323cf27dbSMartin Matuska		$authors_email{$sname} = $semail;
12423cf27dbSMartin Matuska
12523cf27dbSMartin Matuska		# The name/email in AUTHORS is already the "best looking"
12623cf27dbSMartin Matuska		# version, by definition.
12723cf27dbSMartin Matuska		$display_name{$sname} = $name;
12823cf27dbSMartin Matuska		$display_email{$semail} = $email;
12923cf27dbSMartin Matuska	}
13023cf27dbSMartin Matuska}
13123cf27dbSMartin Matuska
132*b1c1ee44SMartin Matuska# Next, we load all the commit authors and signoff pairs, and form name<->email
133*b1c1ee44SMartin Matuska# mappings, keyed on slug. Note that this format is getting the
134*b1c1ee44SMartin Matuska# .mailmap-converted form. This lets us control the input to some extent by
135*b1c1ee44SMartin Matuska# making changes there.
136*b1c1ee44SMartin Matuskamy %seen_names;
137*b1c1ee44SMartin Matuskamy %seen_emails;
13823cf27dbSMartin Matuska
139*b1c1ee44SMartin Matuska# The true email address from commits, by slug. We do this so we can generate
140*b1c1ee44SMartin Matuska# mailmap entries, which will only match the exact address from the commit,
141*b1c1ee44SMartin Matuska# not anything "prettified". This lets us remember the prefix part of Github
142*b1c1ee44SMartin Matuska# noreply addresses, while not including it in AUTHORS if that is truly the
143*b1c1ee44SMartin Matuska# best option we have.
144*b1c1ee44SMartin Matuskamy %commit_email;
145*b1c1ee44SMartin Matuska
146*b1c1ee44SMartin Matuskafor my $line (reverse qx(git log --pretty=tformat:'%aN:::%aE:::%(trailers:key=signed-off-by,valueonly,separator=:::)')) {
14723cf27dbSMartin Matuska	chomp $line;
148*b1c1ee44SMartin Matuska	my ($name, $email, @signoffs) = split ':::', $line;
14923cf27dbSMartin Matuska	next unless $name && $email;
15023cf27dbSMartin Matuska
15123cf27dbSMartin Matuska	my $semail = email_slug($email);
15223cf27dbSMartin Matuska	my $sname = name_slug($name);
15323cf27dbSMartin Matuska
154*b1c1ee44SMartin Matuska	# Track the committer name and email.
155*b1c1ee44SMartin Matuska	$seen_names{$semail}{$sname} = 1;
156*b1c1ee44SMartin Matuska	$seen_emails{$sname}{$semail} = 1;
15723cf27dbSMartin Matuska
158*b1c1ee44SMartin Matuska	# Keep the original commit address.
159*b1c1ee44SMartin Matuska	$commit_email{$semail} = $email;
16023cf27dbSMartin Matuska
161*b1c1ee44SMartin Matuska	# Consider if these are the best we've ever seen.
16223cf27dbSMartin Matuska	update_display_name($name);
163*b1c1ee44SMartin Matuska	update_display_email($email);
164*b1c1ee44SMartin Matuska
165*b1c1ee44SMartin Matuska	# Check signoffs. any that have a matching name or email as the
166*b1c1ee44SMartin Matuska	# committer (by slug), also track them.
167*b1c1ee44SMartin Matuska	for my $signoff (@signoffs) {
168*b1c1ee44SMartin Matuska		my ($soname, $soemail) = $signoff =~ m/^([^<]+)\s+<(.+)>$/;
169*b1c1ee44SMartin Matuska		next unless $soname && $soemail;
170*b1c1ee44SMartin Matuska		my $ssoname = name_slug($soname);
171*b1c1ee44SMartin Matuska		my $ssoemail = email_slug($soemail);
172*b1c1ee44SMartin Matuska		if (($semail eq $ssoemail) ^ ($sname eq $ssoname)) {
173*b1c1ee44SMartin Matuska		    $seen_names{$ssoemail}{$ssoname} = 1;
174*b1c1ee44SMartin Matuska		    $seen_emails{$ssoname}{$ssoemail} = 1;
175*b1c1ee44SMartin Matuska		    update_display_name($soname);
176*b1c1ee44SMartin Matuska		    update_display_email($soemail);
177*b1c1ee44SMartin Matuska		}
17823cf27dbSMartin Matuska	}
17923cf27dbSMartin Matuska}
18023cf27dbSMartin Matuska
18123cf27dbSMartin Matuska# Now collect unique committers by all names+emails we've ever seen for them.
18223cf27dbSMartin Matuska# We start with emails and resolve all possible names, then we resolve the
18323cf27dbSMartin Matuska# emails for those names, and round and round until there's nothing left.
18423cf27dbSMartin Matuskamy @committers;
185*b1c1ee44SMartin Matuskafor my $start_email (sort keys %seen_names) {
18623cf27dbSMartin Matuska	# it might have been deleted already through a cross-reference
187*b1c1ee44SMartin Matuska	next unless $seen_names{$start_email};
18823cf27dbSMartin Matuska
18923cf27dbSMartin Matuska	my %emails;
19023cf27dbSMartin Matuska	my %names;
19123cf27dbSMartin Matuska
19223cf27dbSMartin Matuska	my @check_emails = ($start_email);
19323cf27dbSMartin Matuska	my @check_names;
19423cf27dbSMartin Matuska	while (@check_emails || @check_names) {
19523cf27dbSMartin Matuska		while (my $email = shift @check_emails) {
19623cf27dbSMartin Matuska			next if $emails{$email}++;
19723cf27dbSMartin Matuska			push @check_names,
198*b1c1ee44SMartin Matuska			    sort keys %{delete $seen_names{$email}};
19923cf27dbSMartin Matuska		}
20023cf27dbSMartin Matuska		while (my $name = shift @check_names) {
20123cf27dbSMartin Matuska			next if $names{$name}++;
20223cf27dbSMartin Matuska			push @check_emails,
203*b1c1ee44SMartin Matuska			    sort keys %{delete $seen_emails{$name}};
20423cf27dbSMartin Matuska		}
20523cf27dbSMartin Matuska	}
20623cf27dbSMartin Matuska
20723cf27dbSMartin Matuska	# A "committer" is the collection of connected names and emails.
20823cf27dbSMartin Matuska	push @committers, [[sort keys %emails], [sort keys %names]];
20923cf27dbSMartin Matuska}
21023cf27dbSMartin Matuska
21123cf27dbSMartin Matuska# Now we have our committers, we can work out what to add to AUTHORS.
21223cf27dbSMartin Matuskafor my $committer (@committers) {
21323cf27dbSMartin Matuska	my ($emails, $names) = @$committer;
21423cf27dbSMartin Matuska
21523cf27dbSMartin Matuska	# If this commiter is already in AUTHORS, we must not touch.
21623cf27dbSMartin Matuska	next if grep { $authors_name{$_} } @$emails;
21723cf27dbSMartin Matuska	next if grep { $authors_email{$_} } @$names;
21823cf27dbSMartin Matuska
21923cf27dbSMartin Matuska	# Decide on the "best" name and email to use
22023cf27dbSMartin Matuska	my $email = best_email(@$emails);
22123cf27dbSMartin Matuska	my $name = best_name(@$names);
22223cf27dbSMartin Matuska
22323cf27dbSMartin Matuska	$authors_email{$name} = $email;
22423cf27dbSMartin Matuska	$authors_name{$email} = $name;
225*b1c1ee44SMartin Matuska
226*b1c1ee44SMartin Matuska	# We've now selected our canonical name going forward. If there
227*b1c1ee44SMartin Matuska	# were other options from commit authors only (not signoffs),
228*b1c1ee44SMartin Matuska	# emit mailmap lines for the user to past into .mailmap
229*b1c1ee44SMartin Matuska	my $cemail = $display_email{email_slug($authors_email{$name})};
230*b1c1ee44SMartin Matuska	for my $alias (@$emails) {
231*b1c1ee44SMartin Matuska		next if $alias eq $email;
232*b1c1ee44SMartin Matuska
233*b1c1ee44SMartin Matuska		my $calias = $commit_email{$alias};
234*b1c1ee44SMartin Matuska		next unless $calias;
235*b1c1ee44SMartin Matuska
236*b1c1ee44SMartin Matuska		my $cname = $display_name{$name};
237*b1c1ee44SMartin Matuska		say "$cname <$cemail> <$calias>";
238*b1c1ee44SMartin Matuska	}
23923cf27dbSMartin Matuska}
24023cf27dbSMartin Matuska
24123cf27dbSMartin Matuska# Now output the new AUTHORS file
24223cf27dbSMartin Matuskaopen my $fh, '>', 'AUTHORS' or die "E: couldn't open AUTHORS for write: $!\n";
24323cf27dbSMartin Matuskasay $fh join("\n", @authors_header, "");
24423cf27dbSMartin Matuskafor my $name (sort keys %authors_email) {
24523cf27dbSMartin Matuska	my $cname = $display_name{$name};
24623cf27dbSMartin Matuska	my $cemail = $display_email{email_slug($authors_email{$name})};
24723cf27dbSMartin Matuska	say $fh "    $cname <$cemail>";
24823cf27dbSMartin Matuska}
24923cf27dbSMartin Matuska
25023cf27dbSMartin Matuskaexit 0;
25123cf27dbSMartin Matuska
25223cf27dbSMartin Matuska# "Slugs" are used at the hashtable key for names and emails. They are used to
25323cf27dbSMartin Matuska# making two variants of a value be the "same" for matching. Mostly this is
25423cf27dbSMartin Matuska# to make upper and lower-case versions of a name or email compare the same,
25523cf27dbSMartin Matuska# but we do a little bit of munging to handle some common cases.
25623cf27dbSMartin Matuska#
25723cf27dbSMartin Matuska# Note that these are only used for matching internally; for display, the
25823cf27dbSMartin Matuska# slug will be used to look up the display form.
25923cf27dbSMartin Matuskasub name_slug {
26023cf27dbSMartin Matuska	my ($name) = @_;
26123cf27dbSMartin Matuska
26223cf27dbSMartin Matuska	# Remove spaces and dots, to handle differences in initials.
26323cf27dbSMartin Matuska	$name =~ s/[\s\.]//g;
26423cf27dbSMartin Matuska
26523cf27dbSMartin Matuska	return lc $name;
26623cf27dbSMartin Matuska}
26723cf27dbSMartin Matuskasub email_slug {
26823cf27dbSMartin Matuska	my ($email) = @_;
26923cf27dbSMartin Matuska
27023cf27dbSMartin Matuska	# Remove everything up to and including the first space, and the last
27123cf27dbSMartin Matuska	# space and everything after it.
27223cf27dbSMartin Matuska	$email =~ s/^(.*\s+)|(\s+.*)$//g;
27323cf27dbSMartin Matuska
27423cf27dbSMartin Matuska	# Remove the leading userid+ on Github noreply addresses. They're
27523cf27dbSMartin Matuska	# optional and we want to treat them as the same thing.
27623cf27dbSMartin Matuska	$email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/;
27723cf27dbSMartin Matuska
27823cf27dbSMartin Matuska	return lc $email;
27923cf27dbSMartin Matuska}
28023cf27dbSMartin Matuska
281*b1c1ee44SMartin Matuska# As we accumulate new names and addresses, record the "best looking" version
282*b1c1ee44SMartin Matuska# of each. Once we decide to add a committer to AUTHORS, we'll take the best
283*b1c1ee44SMartin Matuska# version of their name and address from here.
284*b1c1ee44SMartin Matuska#
285*b1c1ee44SMartin Matuska# Note that we don't record them if they're already in AUTHORS (that is, in
286*b1c1ee44SMartin Matuska# %authors_name or %authors_email) because that file already contains the
287*b1c1ee44SMartin Matuska# "best" version, by definition. So we return immediately if we've seen it
288*b1c1ee44SMartin Matuska# there already.
28923cf27dbSMartin Matuskasub update_display_name {
29023cf27dbSMartin Matuska	my ($name) = @_;
29123cf27dbSMartin Matuska	my $sname = name_slug($name);
292*b1c1ee44SMartin Matuska	return if $authors_email{$sname};
29323cf27dbSMartin Matuska
29423cf27dbSMartin Matuska	# For names, "more specific" means "has more non-lower-case characters"
29523cf27dbSMartin Matuska	# (in ASCII), guessing that if a person has gone to some effort to
29623cf27dbSMartin Matuska	# specialise their name in a later commit, they presumably care more
29723cf27dbSMartin Matuska	# about it. If this is wrong, its probably better to add a .mailmap
29823cf27dbSMartin Matuska	# entry.
29923cf27dbSMartin Matuska
30023cf27dbSMartin Matuska	my $cname = $display_name{$sname};
30123cf27dbSMartin Matuska	if (!$cname ||
30223cf27dbSMartin Matuska	    ($name =~ tr/a-z //) < ($cname =~ tr/a-z //)) {
30323cf27dbSMartin Matuska		$display_name{$sname} = $name;
30423cf27dbSMartin Matuska	}
30523cf27dbSMartin Matuska}
30623cf27dbSMartin Matuskasub update_display_email {
30723cf27dbSMartin Matuska	my ($email) = @_;
30823cf27dbSMartin Matuska	my $semail = email_slug($email);
309*b1c1ee44SMartin Matuska	return if $authors_name{$semail};
31023cf27dbSMartin Matuska
31123cf27dbSMartin Matuska	# Like names, we prefer uppercase when possible. We also remove any
31223cf27dbSMartin Matuska	# leading "plus address" for Github noreply addresses.
313*b1c1ee44SMartin Matuska
31423cf27dbSMartin Matuska	$email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/;
31523cf27dbSMartin Matuska
31623cf27dbSMartin Matuska	my $cemail = $display_email{$semail};
31723cf27dbSMartin Matuska	if (!$cemail ||
31823cf27dbSMartin Matuska	    ($email =~ tr/a-z //) < ($cemail =~ tr/a-z //)) {
31923cf27dbSMartin Matuska		$display_email{$semail} = $email;
32023cf27dbSMartin Matuska	}
32123cf27dbSMartin Matuska}
32223cf27dbSMartin Matuska
32323cf27dbSMartin Matuskasub best_name {
32423cf27dbSMartin Matuska	my @names = sort {
32523cf27dbSMartin Matuska		my $cmp;
32623cf27dbSMartin Matuska		my ($aa) = $display_name{$a};
32723cf27dbSMartin Matuska		my ($bb) = $display_name{$b};
32823cf27dbSMartin Matuska
32923cf27dbSMartin Matuska		# The "best" name is very subjective, and a simple sort
33023cf27dbSMartin Matuska		# produced good-enough results, so I didn't try harder. Use of
33123cf27dbSMartin Matuska		# accented characters, punctuation and caps are probably an
33223cf27dbSMartin Matuska		# indicator of "better", but possibly we should also take into
33323cf27dbSMartin Matuska		# account the most recent name we saw, in case the committer
33423cf27dbSMartin Matuska		# has changed their name or nickname or similar.
33523cf27dbSMartin Matuska		#
33623cf27dbSMartin Matuska		# Really, .mailmap is the place to control this.
33723cf27dbSMartin Matuska
33823cf27dbSMartin Matuska		return ($aa cmp $bb);
33923cf27dbSMartin Matuska	} @_;
34023cf27dbSMartin Matuska
34123cf27dbSMartin Matuska	return shift @names;
34223cf27dbSMartin Matuska}
34323cf27dbSMartin Matuskasub best_email {
34423cf27dbSMartin Matuska	state $internal_re = qr/\.(?:internal|local|\(none\))$/;
34523cf27dbSMartin Matuska	state $noreply_re  = qr/\.noreply\.github\.com$/;
34623cf27dbSMartin Matuska	state $freemail_re = qr/\@(?:gmail|hotmail)\.com$/;
34723cf27dbSMartin Matuska
34823cf27dbSMartin Matuska	my @emails = sort {
34923cf27dbSMartin Matuska		my $cmp;
35023cf27dbSMartin Matuska
35123cf27dbSMartin Matuska		# prefer address with a single @ over those without
35223cf27dbSMartin Matuska		$cmp = (($b =~ tr/@//) == 1) <=> (($a =~ tr/@//) == 1);
35323cf27dbSMartin Matuska		return $cmp unless $cmp == 0;
35423cf27dbSMartin Matuska
35523cf27dbSMartin Matuska		# prefer any address over internal/local addresses
35623cf27dbSMartin Matuska		$cmp = (($a =~ $internal_re) <=> ($b =~ $internal_re));
35723cf27dbSMartin Matuska		return $cmp unless $cmp == 0;
35823cf27dbSMartin Matuska
35923cf27dbSMartin Matuska		# prefer any address over github noreply aliases
36023cf27dbSMartin Matuska		$cmp = (($a =~ $noreply_re) <=> ($b =~ $noreply_re));
36123cf27dbSMartin Matuska		return $cmp unless $cmp == 0;
36223cf27dbSMartin Matuska
36323cf27dbSMartin Matuska		# prefer any address over freemail providers
36423cf27dbSMartin Matuska		$cmp = (($a =~ $freemail_re) <=> ($b =~ $freemail_re));
36523cf27dbSMartin Matuska		return $cmp unless $cmp == 0;
36623cf27dbSMartin Matuska
36723cf27dbSMartin Matuska		# alphabetical by domain
36823cf27dbSMartin Matuska		my ($alocal, $adom) = split /\@/, $a;
36923cf27dbSMartin Matuska		my ($blocal, $bdom) = split /\@/, $b;
37023cf27dbSMartin Matuska		$cmp = ($adom cmp $bdom);
37123cf27dbSMartin Matuska		return $cmp unless $cmp == 0;
37223cf27dbSMartin Matuska
37323cf27dbSMartin Matuska		# alphabetical by local part
37423cf27dbSMartin Matuska		return ($alocal cmp $blocal);
37523cf27dbSMartin Matuska	} @_;
37623cf27dbSMartin Matuska
37723cf27dbSMartin Matuska	return shift @emails;
37823cf27dbSMartin Matuska}
379