xref: /freebsd/sys/contrib/openzfs/scripts/spdxcheck.pl (revision 3a8960711f4319f9b894ea2453c89065ee1b3a10)
1#!/usr/bin/env perl
2
3# SPDX-License-Identifier: MIT
4#
5# Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
6#
7# Permission is hereby granted, free of charge, to any person obtaining a copy
8# of this software and associated documentation files (the "Software"), to
9# deal in the Software without restriction, including without limitation the
10# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
11# sell copies of the Software, and to permit persons to whom the Software is
12# furnished to do so, subject to the following conditions:
13#
14# The above copyright notice and this permission notice shall be included in
15# all copies or substantial portions of the Software.
16#
17# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23# IN THE SOFTWARE.
24
25use 5.010;
26use warnings;
27use strict;
28
29# All files known to git are either "tagged" or "untagged". Tagged files are
30# expected to have a license tag, while untagged files are expected to _not_
31# have a license tag. There is no "optional" tag; all files are either "tagged"
32# or "untagged".
33#
34# Whether or not a file is tagged or untagged is determined using the patterns
35# in $tagged_patterns and $untagged_patterns and the following sequence:
36#
37# - if the file's full path is explicitly listed in $tagged_patterns, then the
38#   file is tagged.
39#
40# - if the file's full path is explicitly listed in $untagged_patterns, then
41#   the file is untagged.
42#
43# - if the filename matches a pattern in $tagged_patterns, and does not match a
44#   pattern in $untagged_patterns, then the file is tagged.
45#
46# - otherwise, the file is untagged.
47#
48# The patterns do a simple glob-like match over the entire path relative to the
49# root of the git repo (no leading /). '*' matches anything at that point,
50# across path fragments. '?' matches a single character.
#
# In the lists themselves, blank lines and anything from '#' to the end of a
# line are ignored. An entry containing neither '*' nor '?' is treated as an
# explicitly listed file path rather than as a pattern.
51
52my $tagged_patterns = q(
53	# Compiled source files
54	*.c
55	*.h
56	*.S
57
58	# Python files, eg test suite drivers, libzfs bindings
59	*.py
60	*.py.in
61
62	# Various support scripts
63	*.sh
64	*.pl
65
66	# Test suite
67	*.ksh
68	*.ksh.in
69	*.kshlib
70	*.kshlib.in
71	*.shlib
72
73	# Test suite data files
74	*.run
75	*.cfg
76	*.cfg.in
77	*.fio
78	*.lua
79	*.zcp
80
81	# Manpages
82	man/man?/*.?
83	man/man?/*.?.in
84
85	# Unsuffixed programs (or generated of same)
86	cmd/arcstat.in
87	cmd/arc_summary
88	cmd/dbufstat.in
89	cmd/zilstat.in
90	cmd/zpool/zpool.d/*
91	etc/init.d/zfs-import.in
92	etc/init.d/zfs-load-key.in
93	etc/init.d/zfs-mount.in
94	etc/init.d/zfs-share.in
95	etc/init.d/zfs-zed.in
96	etc/zfs/zfs-functions.in
97	scripts/objtool-wrapper.in
98
99	# Misc items that have clear licensing info but aren't easily matched,
100	# or are the first of a class that we aren't ready to match yet.
101	config/ax_code_coverage.m4
102	configure.ac
103	module/lua/README.zfs
104	scripts/kmodtool
105	tests/zfs-tests/tests/functional/inheritance/README.config
106	tests/zfs-tests/tests/functional/inheritance/README.state
107	cmd/zed/zed.d/statechange-notify.sh
108);
109
# Files that are expected to carry no license tag. Entries are written the
# same way as in $tagged_patterns: glob-like patterns, or explicitly listed
# paths when the entry contains neither '*' nor '?'. A pattern here excludes
# files that a tagged pattern would otherwise match.
110my $untagged_patterns = q(
111	# Exclude CI tooling as it's not interesting for overall project
112	# licensing.
113	.github/*
114
115	# Everything below this has unclear licensing. Work is happening to
116	# identify and update them. Once one gains a tag it should be removed
117	# from this list.
118
119	cmd/zed/zed.d/*.sh
120	cmd/zpool/zpool.d/*
121
122	contrib/coverity/model.c
123	include/libzdb.h
124	include/os/freebsd/spl/sys/inttypes.h
125	include/os/freebsd/spl/sys/mode.h
126	include/os/freebsd/spl/sys/trace.h
127	include/os/freebsd/spl/sys/trace_zfs.h
128	include/os/freebsd/zfs/sys/zpl.h
129	include/os/linux/kernel/linux/page_compat.h
130	lib/libspl/include/os/freebsd/sys/sysmacros.h
131	lib/libspl/include/sys/string.h
132	lib/libspl/include/sys/trace_spl.h
133	lib/libspl/include/sys/trace_zfs.h
134	lib/libzdb/libzdb.c
135	module/lua/setjmp/setjmp.S
136	module/lua/setjmp/setjmp_ppc.S
137	module/zstd/include/sparc_compat.h
138	module/zstd/zstd_sparc.c
139	tests/zfs-tests/cmd/cp_files.c
140	tests/zfs-tests/cmd/zed_fd_spill-zedlet.c
141	tests/zfs-tests/tests/functional/tmpfile/tmpfile_001_pos.c
142	tests/zfs-tests/tests/functional/tmpfile/tmpfile_002_pos.c
143	tests/zfs-tests/tests/functional/tmpfile/tmpfile_003_pos.c
144	tests/zfs-tests/tests/functional/tmpfile/tmpfile_test.c
145
146	autogen.sh
147	contrib/bpftrace/zfs-trace.sh
148	contrib/pyzfs/docs/source/conf.py
149	contrib/pyzfs/libzfs_core/test/__init__.py
150	contrib/pyzfs/setup.py.in
151	contrib/zcp/autosnap.lua
152	scripts/commitcheck.sh
153	scripts/man-dates.sh
154	scripts/mancheck.sh
155	scripts/paxcheck.sh
156	scripts/zfs-helpers.sh
157	scripts/zfs-tests-color.sh
158	scripts/zfs.sh
159	scripts/zimport.sh
160	tests/zfs-tests/callbacks/zfs_failsafe.ksh
161	tests/zfs-tests/include/commands.cfg
162	tests/zfs-tests/include/tunables.cfg
163	tests/zfs-tests/include/zpool_script.shlib
164	tests/zfs-tests/tests/functional/mv_files/random_creation.ksh
165);
166
167# For files expected to have a license tag, these are the acceptable tags by
168# path. A file in one of these paths with a tag not listed here must be in the
169# override list below. If the file is not in any of these paths, then
170# $default_license_tags is used.
171my $default_license_tags = [
172    'CDDL-1.0', '0BSD', 'BSD-2-Clause', 'BSD-3-Clause', 'MIT'
173];
174
# Flat list of (path prefix => acceptable tags) pairs. Each prefix is compared
# against the start of a file's path; when more than one prefix matches, the
# longest one supplies the tag set.
175my @path_license_tags = (
176	# Conventional wisdom is that the Linux SPL must be GPL2+ for
177	# kernel compatibility.
178	'module/os/linux/spl' => ['GPL-2.0-or-later'],
179	'include/os/linux/spl' => ['GPL-2.0-or-later'],
180
181	# Third-party code should keep its original license
182	'module/zstd/lib' => ['BSD-3-Clause OR GPL-2.0-only'],
183	'module/lua' => ['MIT'],
184
185	# lua/setjmp is platform-specific code sourced from various places
186	'module/lua/setjmp' => $default_license_tags,
187
188	# Some of the fletcher modules are dual-licensed
189	'module/zcommon/zfs_fletcher' =>
190	    ['BSD-2-Clause OR GPL-2.0-only', 'CDDL-1.0'],
191
192	'module/icp' => ['Apache-2.0', 'CDDL-1.0'],
193
194	# Python bindings are always Apache-2.0
195	'contrib/pyzfs' => ['Apache-2.0'],
196);
197
198# This is a list of "special case" license tags that are in use in the tree,
199# and the files where they occur. These exist for a variety of reasons, and
200# generally should not be used for new code. If you need to bring in code that
201# has a different license from the acceptable ones listed above, then you will
202# also need to add it here, with rationale provided and approval given in your
203# PR.
#
# Each key is a license tag and each value is the list of files that carry it.
# For a file listed here, the override tag is the only acceptable tag; the
# path-based and default tag sets are not consulted.
204my %override_file_license_tags = (
205
206	# SPDX have repeatedly rejected the creation of a tag for a public
207	# domain dedication, as not all dedications are clear and unambiguous
208	# in their meaning and not all jurisdictions permit relinquishing a
209	# copyright anyway.
210	#
211	# A reasonably common workaround appears to be to create a local
212	# (project-specific) identifier to convey whatever meaning the project
213	# wishes it to. To cover OpenZFS' use of third-party code with a
214	# public domain dedication, we use this custom tag.
215	#
216	# Further reading:
217	#   https://github.com/spdx/old-wiki/blob/main/Pages/Legal%20Team/Decisions/Dealing%20with%20Public%20Domain%20within%20SPDX%20Files.md
218	#   https://spdx.github.io/spdx-spec/v2.3/other-licensing-information-detected/
219	#   https://cr.yp.to/spdx.html
220	#
221	'LicenseRef-OpenZFS-ThirdParty-PublicDomain' => [qw(
222		include/sys/skein.h
223		module/icp/algs/skein/skein_block.c
224		module/icp/algs/skein/skein.c
225		module/icp/algs/skein/skein_impl.h
226		module/icp/algs/skein/skein_iv.c
227		module/icp/algs/skein/skein_port.h
228		module/zfs/vdev_draid_rand.c
229	)],
230
231	# Legacy inclusions
232	'Brian-Gladman-3-Clause' => [qw(
233		module/icp/asm-x86_64/aes/aestab.h
234		module/icp/asm-x86_64/aes/aesopt.h
235		module/icp/asm-x86_64/aes/aeskey.c
236		module/icp/asm-x86_64/aes/aes_amd64.S
237	)],
238	'OpenSSL-standalone' => [qw(
239		module/icp/asm-x86_64/aes/aes_aesni.S
240	)],
241	'LGPL-2.1-or-later' => [qw(
242		config/ax_code_coverage.m4
243	)],
244
245	# Legacy inclusions of BSD-2-Clause files in Linux SPL.
246	'BSD-2-Clause' => [qw(
247		include/os/linux/spl/sys/debug.h
248		module/os/linux/spl/spl-zone.c
249	)],
250
251	# Temporary overrides for things that have the wrong license for
252	# their path. Work is underway to understand and resolve these.
253	'GPL-2.0-or-later' => [qw(
254		include/os/freebsd/spl/sys/kstat.h
255		include/os/freebsd/spl/sys/sunddi.h
256		include/sys/mod.h
257	)],
258	'CDDL-1.0' => [qw(
259		include/os/linux/spl/sys/errno.h
260		include/os/linux/spl/sys/ia32/asm_linkage.h
261		include/os/linux/spl/sys/misc.h
262		include/os/linux/spl/sys/procfs_list.h
263		include/os/linux/spl/sys/trace.h
264		include/os/linux/spl/sys/trace_spl.h
265		include/os/linux/spl/sys/trace_taskq.h
266		include/os/linux/spl/sys/wmsum.h
267		module/os/linux/spl/spl-procfs-list.c
268		module/os/linux/spl/spl-trace.c
269		module/lua/README.zfs
270	)],
271);
272
273##########
274
# Parse a pattern list into the pieces used to classify files.
#
# $patterns is a newline-separated block of text. Leading and trailing
# whitespace, '#' comments and blank lines are discarded. Each remaining
# entry is either:
#
# - a literal path (no '*' or '?'), collected into a lookup hash; or
# - a glob-like pattern, where '*' matches any run of characters (including
#   '/') and '?' matches exactly one character. Every other character
#   matches only itself.
#
# Returns a two-element list: a compiled regex that matches when any of the
# glob patterns matches an entire path, and a hashref whose keys are the
# literal paths (each mapped to 1).
sub setup_patterns {
	my ($patterns) = @_;

	my @re;
	my @files;

	for my $pat (split "\n", $patterns) {
		# remove leading whitespace, then trailing whitespace along
		# with any trailing comment
		$pat =~ s/^\s+//;
		$pat =~ s/\s*(?:#.*)?$//;

		# skip (now-)empty lines
		next if $pat eq '';

		# if the "pattern" has no metachars, then it's a literal file
		# path and gets matched a bit more strongly
		unless ($pat =~ m/[?*]/) {
			push @files, $pat;
			next;
		}

		# Glob to regex conversion. Split on the glob metachars
		# (keeping them), translate those, and quote every other
		# fragment so that nothing else in a path (eg '+' or ')') is
		# treated as regex syntax.
		push @re, join '', map {
			$_ eq '*' ? '.*' :
			$_ eq '?' ? '.'  :
			quotemeta($_)
		} split /([?*])/, $pat;
	}

	# With no glob patterns the alternation would be empty, which matches
	# the empty string; use a pattern that can never match instead.
	my $re = @re ? join('|', @re) : '(?!)';
	return (qr/\A(?:$re)\z/, { map { $_ => 1 } @files });
}
308
# Compile both pattern lists once, up front. Each call yields the regex for
# the glob-style entries and a lookup hash of the explicitly listed paths.
my ($tagged_re, $tagged_files) = setup_patterns($tagged_patterns);
my ($untagged_re, $untagged_files) = setup_patterns($untagged_patterns);

# Decide whether $file is expected to carry a license tag. Returns true if
# so, false otherwise.
#
# Explicitly listed paths are removed from their lookup hash as they are
# seen, so whatever remains once every file has been checked names a listed
# path that was never encountered.
sub file_is_tagged {
	my ($file) = @_;

	# An explicit listing settles it, with "tagged" checked first.
	# delete hands back the removed value, which is only ever 1 (listed)
	# or undef (not listed).
	return 1 if delete $tagged_files->{$file};
	return 0 if delete $untagged_files->{$file};

	# Otherwise the file is tagged only if a tagged pattern matches and
	# no untagged pattern does.
	return $file =~ $tagged_re && $file !~ $untagged_re;
}
330
# Invert %override_file_license_tags (tag => list of files) into a lookup
# from file path to the one tag that file is allowed to carry.
my %override_tags;
for my $tag (keys %override_file_license_tags) {
	$override_tags{$_} = $tag for @{$override_file_license_tags{$tag}};
}
335
336##########
337
# Exit status for the whole run: 0 until a problem is reported, then 1.
my $rc = 0;

# Get a list of all files known to git. This is a crude way of avoiding any
# build artifacts that have tags embedded in them.
#
# -z asks for NUL-terminated, unquoted names, so paths that git would
# otherwise C-quote (non-ASCII bytes, embedded quotes, etc) arrive intact and
# can be opened as-is. A failure to run git is fatal; carrying on with an
# empty list would only produce misleading "not on disk" reports.
my $git_output = qx(git ls-tree --name-only -r -z HEAD);
die "$0: couldn't list files with git ls-tree\n"
    if !defined $git_output || $? != 0;
my @git_files = sort split /\0/, $git_output;

# Scan all files and work out if their tags are correct.
for my $file (@git_files) {
	# Ignore non-files. git can store other types of objects (submodule
	# dirs, symlinks, etc) that aren't interesting for licensing.
	next unless -f $file && ! -l $file;

	# Open the file, and extract its license tag. We only check the first
	# 4K of each file because many of these files are large, binary, or
	# both.  For a typical source file that means the tag should be found
	# within the first ~50 lines.
	open my $fh, '<', $file or die "$0: couldn't open $file: $!\n";
	my $nbytes = read $fh, my $buf, 4096;
	die "$0: couldn't read $file: $!\n" if !defined $nbytes;
	close $fh;

	# The match is evaluated in list context, so $tag receives the first
	# captured identifier, or undef if the buffer holds no tag. The tag
	# must run to the end of its line.
	my ($tag) =
	    $buf =~ m/\bSPDX-License-Identifier: ([A-Za-z0-9_\-\. ]+)$/smg;

	# Decide if the file should have a tag at all
	my $tagged = file_is_tagged($file);

	# If no license tag is wanted, there's not much left to do
	if (!$tagged) {
		if (defined $tag) {
			# untagged file has a tag, pattern change required
			say "unexpected license tag: $file";
			$rc = 1;
		}
		next;
	}

	# If a tag is required, but doesn't have one, warn and loop.
	if (!defined $tag) {
		say "missing license tag: $file";
		$rc = 1;
		next;
	}

	# Determine the set of valid license tags for this file. Start with
	# the defaults.
	my $tags = $default_license_tags;

	if ($override_tags{$file}) {
		# File has an explicit override, use it. Removing the entry
		# records that the override was actually used.
		$tags = [delete $override_tags{$file}];
	} else {
		# Work through the path tag sets, taking the set with the
		# most precise (longest) matching prefix. If no sets match,
		# we fall through and are left with the default set.
		# @path_license_tags is a flat list of pairs, hence the
		# stride of two.
		my $matchlen = 0;
		for (my $n = 0; $n < @path_license_tags; $n += 2) {
			my ($path, $t) = @path_license_tags[$n,$n+1];
			next unless length($path) > $matchlen;
			next unless index($file, $path) == 0;
			$tags = $t;
			$matchlen = length($path);
		}
	}

	# Confirm the file's tag is in the set, and warn if not.
	unless (grep { $_ eq $tag } @$tags) {
		say "invalid license tag: $file";
		say "    (got $tag; expected: @$tags)";
		$rc = 1;
	}
}
414
415##########
416
# Anything still present in these lookups was listed explicitly but never
# consumed while scanning. Likely the file was removed from the repo but not
# from our lists. Report each leftover, one list at a time, sorted by path.
my @leftovers = (
	[ 'tagged',     $tagged_files ],
	[ 'untagged',   $untagged_files ],
	[ 'overridden', \%override_tags ],
);
for my $entry (@leftovers) {
	my ($kind, $remaining) = @$entry;
	for my $file (sort keys %$remaining) {
		say "explicitly $kind file not on disk: $file";
		$rc = 1;
	}
}

exit $rc;
434