#! /usr/bin/python
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#

#
# Copyright 2025 Richard Lowe
#

#
# Check that source files contain only valid UTF-8
#

import io
import sys


def utf8check(path, output=None):
    """Check that the file at *path* contains only valid UTF-8.

    The file is read in binary mode and each line is decoded
    individually.  Every line that fails to decode counts as one error.
    The first nine errors are written to *output* as
    ``"<path>: <line number>: <decode error>"``; the tenth produces a
    single ``"<path>: ... and more ..."`` line, and any further errors
    are counted silently.

    Args:
        path: Path of the file to check.
        output: Writable text stream that receives the diagnostics.
            When omitted (or None), the value of ``sys.stderr`` at
            call time is used.

    Returns:
        0 if every line decoded as UTF-8, 1 if at least one did not.

    Raises:
        OSError: If *path* cannot be opened or read.
    """
    # Resolve the default stream at call time rather than at definition
    # time, so that a sys.stderr rebound after this module was imported
    # (redirection, test capture) is honoured.
    if output is None:
        output = sys.stderr

    # When a file is opened as binary Python specifies a b'\n'
    # line-ending.  Because all valid multi-byte utf-8 characters have the
    # high bit set and this does not, it is safe for us to split a file
    # line-wise _before_ we decode it, we will never split in the middle
    # of a valid character, but may report an invalid character as if it
    # were on two separate lines.
    ret = 0
    errs = 0
    with io.open(path, 'rb') as fh:
        # Line numbers in the diagnostics are 1-based.
        for lineno, line in enumerate(fh, start=1):
            try:
                line.decode("utf-8")
            except UnicodeDecodeError as e:
                ret = 1
                errs += 1
                # Cap the noise: nine detailed reports, then one marker
                # line, then nothing more for this file.
                if errs < 10:
                    output.write(f"{path}: {lineno}: {e}\n")
                elif errs == 10:
                    output.write(f"{path}: ... and more ...\n")
    return ret