xref: /illumos-gate/usr/src/tools/onbld/Checks/UTF8Check.py (revision 7f3d7c9289dee6488b3cd2848a68c0b8580d750c)
1#! /usr/bin/python
2#
3# This file and its contents are supplied under the terms of the
4# Common Development and Distribution License ("CDDL"), version 1.0.
5# You may only use this file in accordance with the terms of version
6# 1.0 of the CDDL.
7#
8# A full copy of the text of the CDDL should have accompanied this
9# source.  A copy of the CDDL is also available via the Internet at
10# http://www.illumos.org/license/CDDL.
11#
12
13#
14# Copyright 2025 Richard Lowe
15#
16
17#
18# Check that source files contain only valid UTF-8
19#
20
21import io
22import sys
23
def utf8check(path, output=sys.stderr):
	"""Check that the file at path contains only valid UTF-8.

	The file is read in binary mode and decoded one line at a time.
	Each line that fails to decode is reported to output as
	"<path>: <line number>: <decoder error>".  Only the first nine
	failures are reported in full; the tenth is replaced by a single
	"... and more ..." notice, after which scanning stops because
	neither the output nor the result can change any further.

	Arguments:
	  path: name of the file to check, passed to io.open()
	  output: object with a write() method that receives the
	    diagnostics (defaults to sys.stderr)

	Returns 0 if every line decoded cleanly, 1 otherwise.  Errors
	raised while opening or reading the file are not caught here.
	"""
	# Number of failures that produce any output: nine detailed
	# messages followed by one summary notice.
	limit = 10

	# Iterating over a file opened as binary splits it on b'\n'.
	# Every byte of a valid multi-byte UTF-8 character has the high
	# bit set and b'\n' does not, so splitting line-wise _before_
	# decoding never cuts a valid character in half.  An invalid
	# sequence that contains a newline byte may, however, be reported
	# as if it were on two separate lines.
	errs = 0
	with io.open(path, 'rb') as fh:
		for lineno, line in enumerate(fh, start=1):
			try:
				line.decode("utf-8")
			except UnicodeDecodeError as e:
				errs += 1
				if errs < limit:
					output.write(f"{path}: {lineno}: {e}\n")
				else:
					output.write(f"{path}: ... and more ...\n")
					# Nothing further would be reported and
					# the result is already known to be 1.
					break
	return 1 if errs else 0
45