xref: /freebsd/contrib/libdiff/lib/diff_atomize_text.c (revision 601925180df4d165f4c2c4dc91a49da7b9f3438b)
1  /* Split source by line breaks, and calculate a simplistic checksum. */
2  /*
3   * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4   *
5   * Permission to use, copy, modify, and distribute this software for any
6   * purpose with or without fee is hereby granted, provided that the above
7   * copyright notice and this permission notice appear in all copies.
8   *
9   * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10   * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11   * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12   * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13   * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14   * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16   */
17  
18  #include <errno.h>
19  #include <setjmp.h>
20  #include <signal.h>
21  #include <stdbool.h>
22  #include <stdint.h>
23  #include <stdio.h>
24  #include <stdlib.h>
25  #include <unistd.h>
26  #include <ctype.h>
27  
28  #include <arraylist.h>
29  #include <diff_main.h>
30  
31  #include "diff_internal.h"
32  #include "diff_debug.h"
33  
/*
 * Fold one more byte of a line into its running checksum.
 *
 * hash is the checksum accumulated so far (0 for a fresh line) and
 * atom_byte is the next byte to mix in.  The result is the previous
 * checksum scaled by 23 plus the byte value; unsigned arithmetic wraps
 * around on overflow.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	const unsigned int multiplier = 23;
	unsigned int updated = hash;

	updated *= multiplier;
	updated += atom_byte;
	return updated;
}
39  
40  static int
41  diff_data_atomize_text_lines_fd(struct diff_data *d)
42  {
43  	off_t pos = 0;
44  	const off_t end = pos + d->len;
45  	unsigned int array_size_estimate = d->len / 50;
46  	unsigned int pow2 = 1;
47  	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
48  	bool embedded_nul = false;
49  
50  	while (array_size_estimate >>= 1)
51  		pow2++;
52  
53  	ARRAYLIST_INIT(d->atoms, 1 << pow2);
54  
55  	if (fseek(d->root->f, 0L, SEEK_SET) == -1)
56  		return errno;
57  
58  	while (pos < end) {
59  		off_t line_end = pos;
60  		unsigned int hash = 0;
61  		unsigned char buf[512];
62  		size_t r, i;
63  		struct diff_atom *atom;
64  		int eol = 0;
65  
66  		while (eol == 0 && line_end < end) {
67  			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
68  			if (r == 0 && ferror(d->root->f))
69  				return EIO;
70  			i = 0;
71  			while (eol == 0 && i < r) {
72  				if (buf[i] != '\r' && buf[i] != '\n') {
73  					if (!ignore_whitespace
74  					    || !isspace((unsigned char)buf[i]))
75  						hash = diff_atom_hash_update(
76  						    hash, buf[i]);
77  					if (buf[i] == '\0')
78  						embedded_nul = true;
79  					line_end++;
80  				} else
81  					eol = buf[i];
82  				i++;
83  			}
84  		}
85  
86  		/* When not at the end of data, the line ending char ('\r' or
87  		 * '\n') must follow */
88  		if (line_end < end)
89  			line_end++;
90  		/* If that was an '\r', also pull in any following '\n' */
91  		if (line_end < end && eol == '\r') {
92  			if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
93  				return errno;
94  			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
95  			if (r == 0 && ferror(d->root->f))
96  				return EIO;
97  			if (r > 0 && buf[0] == '\n')
98  				line_end++;
99  		}
100  
101  		/* Record the found line as diff atom */
102  		ARRAYLIST_ADD(atom, d->atoms);
103  		if (!atom)
104  			return ENOMEM;
105  
106  		*atom = (struct diff_atom){
107  			.root = d,
108  			.pos = pos,
109  			.at = NULL,	/* atom data is not memory-mapped */
110  			.len = line_end - pos,
111  			.hash = hash,
112  		};
113  
114  		/* Starting point for next line: */
115  		pos = line_end;
116  		if (fseeko(d->root->f, pos, SEEK_SET) == -1)
117  			return errno;
118  	}
119  
120  	/* File are considered binary if they contain embedded '\0' bytes. */
121  	if (embedded_nul)
122  		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
123  
124  	return DIFF_RC_OK;
125  }
126  
/*
 * Jump target used to recover from a signal raised while scanning
 * memory-mapped file data.  It must be armed with sigsetjmp() before
 * diff_data_signal_handler() can be allowed to run.
 */
static sigjmp_buf diff_data_signal_env;

/*
 * Signal handler: abandon the interrupted code and resume at the point
 * recorded in diff_data_signal_env, making sigsetjmp() return the signal
 * number there.
 */
static void
diff_data_signal_handler(int signo)
{
	siglongjmp(diff_data_signal_env, signo);
}
133  
134  static int
135  diff_data_atomize_text_lines_mmap(struct diff_data *d)
136  {
137  	struct sigaction act, oact;
138  	const uint8_t *volatile pos = d->data;
139  	const uint8_t *end = pos + d->len;
140  	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
141  	bool embedded_nul = false;
142  	unsigned int array_size_estimate = d->len / 50;
143  	unsigned int pow2 = 1;
144  	while (array_size_estimate >>= 1)
145  		pow2++;
146  
147  	ARRAYLIST_INIT(d->atoms, 1 << pow2);
148  
149  	sigemptyset(&act.sa_mask);
150  	act.sa_flags = 0;
151  	act.sa_handler = diff_data_signal_handler;
152  	sigaction(SIGBUS, &act, &oact);
153  	if (sigsetjmp(diff_data_signal_env, 0) > 0) {
154  		/*
155  		 * The file was truncated while we were reading it.  Set
156  		 * the end pointer to the beginning of the line we were
157  		 * trying to read, adjust the file length, and set a flag.
158  		 */
159  		end = pos;
160  		d->len = end - d->data;
161  		d->atomizer_flags |= DIFF_ATOMIZER_FILE_TRUNCATED;
162  	}
163  	while (pos < end) {
164  		const uint8_t *line_start = pos, *line_end = pos;
165  		unsigned int hash = 0;
166  
167  		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
168  			if (!ignore_whitespace
169  			    || !isspace((unsigned char)*line_end))
170  				hash = diff_atom_hash_update(hash, *line_end);
171  			if (*line_end == '\0')
172  				embedded_nul = true;
173  			line_end++;
174  		}
175  
176  		/* When not at the end of data, the line ending char ('\r' or
177  		 * '\n') must follow */
178  		if (line_end < end && *line_end == '\r')
179  			line_end++;
180  		if (line_end < end && *line_end == '\n')
181  			line_end++;
182  
183  		/* Record the found line as diff atom */
184  		struct diff_atom *atom;
185  		ARRAYLIST_ADD(atom, d->atoms);
186  		if (!atom)
187  			return ENOMEM;
188  
189  		*atom = (struct diff_atom){
190  			.root = d,
191  			.pos = (off_t)(line_start - d->data),
192  			.at = line_start,
193  			.len = line_end - line_start,
194  			.hash = hash,
195  		};
196  
197  		/* Starting point for next line: */
198  		pos = line_end;
199  	}
200  	sigaction(SIGBUS, &oact, NULL);
201  
202  	/* File are considered binary if they contain embedded '\0' bytes. */
203  	if (embedded_nul)
204  		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
205  
206  	return DIFF_RC_OK;
207  }
208  
209  static int
210  diff_data_atomize_text_lines(struct diff_data *d)
211  {
212  	if (d->data == NULL)
213  		return diff_data_atomize_text_lines_fd(d);
214  	else
215  		return diff_data_atomize_text_lines_mmap(d);
216  }
217  
/*
 * Public atomizer entry point: split the data of d into one atom per
 * text line.  func_data is accepted but not used by this atomizer.
 * Returns the result of diff_data_atomize_text_lines().
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	int rc;

	(void)func_data;

	rc = diff_data_atomize_text_lines(d);
	return rc;
}
223