xref: /freebsd/contrib/libdiff/lib/diff_atomize_text.c (revision 044243fcc9b4c639cf5655e37b98478bcb312590)
1  /* Split source by line breaks, and calculate a simplistic checksum. */
2  /*
3   * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4   *
5   * Permission to use, copy, modify, and distribute this software for any
6   * purpose with or without fee is hereby granted, provided that the above
7   * copyright notice and this permission notice appear in all copies.
8   *
9   * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10   * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11   * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12   * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13   * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14   * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16   */
17  
18  #include <errno.h>
19  #include <stdbool.h>
20  #include <stdint.h>
21  #include <stdio.h>
22  #include <stdlib.h>
23  #include <unistd.h>
24  #include <ctype.h>
25  
26  #include <arraylist.h>
27  #include <diff_main.h>
28  
29  #include "diff_internal.h"
30  #include "diff_debug.h"
31  
/*
 * Fold one more byte into a running atom hash.
 *
 * The previous hash value is multiplied by 23 and the byte is added on top.
 * All arithmetic is done in unsigned int, so overflow simply wraps around.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	const unsigned int multiplier = 23;
	unsigned int next = hash;

	next *= multiplier;
	next += atom_byte;
	return next;
}
37  
38  static int
39  diff_data_atomize_text_lines_fd(struct diff_data *d)
40  {
41  	off_t pos = 0;
42  	const off_t end = pos + d->len;
43  	unsigned int array_size_estimate = d->len / 50;
44  	unsigned int pow2 = 1;
45  	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
46  	bool embedded_nul = false;
47  
48  	while (array_size_estimate >>= 1)
49  		pow2++;
50  
51  	ARRAYLIST_INIT(d->atoms, 1 << pow2);
52  
53  	if (fseek(d->root->f, 0L, SEEK_SET) == -1)
54  		return errno;
55  
56  	while (pos < end) {
57  		off_t line_end = pos;
58  		unsigned int hash = 0;
59  		unsigned char buf[512];
60  		size_t r, i;
61  		struct diff_atom *atom;
62  		int eol = 0;
63  
64  		while (eol == 0 && line_end < end) {
65  			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
66  			if (r == 0 && ferror(d->root->f))
67  				return EIO;
68  			i = 0;
69  			while (eol == 0 && i < r) {
70  				if (buf[i] != '\r' && buf[i] != '\n') {
71  					if (!ignore_whitespace
72  					    || !isspace((unsigned char)buf[i]))
73  						hash = diff_atom_hash_update(
74  						    hash, buf[i]);
75  					if (buf[i] == '\0')
76  						embedded_nul = true;
77  					line_end++;
78  				} else
79  					eol = buf[i];
80  				i++;
81  			}
82  		}
83  
84  		/* When not at the end of data, the line ending char ('\r' or
85  		 * '\n') must follow */
86  		if (line_end < end)
87  			line_end++;
88  		/* If that was an '\r', also pull in any following '\n' */
89  		if (line_end < end && eol == '\r') {
90  			if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
91  				return errno;
92  			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
93  			if (r == 0 && ferror(d->root->f))
94  				return EIO;
95  			if (r > 0 && buf[0] == '\n')
96  				line_end++;
97  		}
98  
99  		/* Record the found line as diff atom */
100  		ARRAYLIST_ADD(atom, d->atoms);
101  		if (!atom)
102  			return ENOMEM;
103  
104  		*atom = (struct diff_atom){
105  			.root = d,
106  			.pos = pos,
107  			.at = NULL,	/* atom data is not memory-mapped */
108  			.len = line_end - pos,
109  			.hash = hash,
110  		};
111  
112  		/* Starting point for next line: */
113  		pos = line_end;
114  		if (fseeko(d->root->f, pos, SEEK_SET) == -1)
115  			return errno;
116  	}
117  
118  	/* File are considered binary if they contain embedded '\0' bytes. */
119  	if (embedded_nul)
120  		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
121  
122  	return DIFF_RC_OK;
123  }
124  
125  static int
126  diff_data_atomize_text_lines_mmap(struct diff_data *d)
127  {
128  	const uint8_t *pos = d->data;
129  	const uint8_t *end = pos + d->len;
130  	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
131  	bool embedded_nul = false;
132  	unsigned int array_size_estimate = d->len / 50;
133  	unsigned int pow2 = 1;
134  	while (array_size_estimate >>= 1)
135  		pow2++;
136  
137  	ARRAYLIST_INIT(d->atoms, 1 << pow2);
138  
139  	while (pos < end) {
140  		const uint8_t *line_end = pos;
141  		unsigned int hash = 0;
142  
143  		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
144  			if (!ignore_whitespace
145  			    || !isspace((unsigned char)*line_end))
146  				hash = diff_atom_hash_update(hash, *line_end);
147  			if (*line_end == '\0')
148  				embedded_nul = true;
149  			line_end++;
150  		}
151  
152  		/* When not at the end of data, the line ending char ('\r' or
153  		 * '\n') must follow */
154  		if (line_end < end && *line_end == '\r')
155  			line_end++;
156  		if (line_end < end && *line_end == '\n')
157  			line_end++;
158  
159  		/* Record the found line as diff atom */
160  		struct diff_atom *atom;
161  		ARRAYLIST_ADD(atom, d->atoms);
162  		if (!atom)
163  			return ENOMEM;
164  
165  		*atom = (struct diff_atom){
166  			.root = d,
167  			.pos = (off_t)(pos - d->data),
168  			.at = pos,
169  			.len = line_end - pos,
170  			.hash = hash,
171  		};
172  
173  		/* Starting point for next line: */
174  		pos = line_end;
175  	}
176  
177  	/* File are considered binary if they contain embedded '\0' bytes. */
178  	if (embedded_nul)
179  		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
180  
181  	return DIFF_RC_OK;
182  }
183  
184  static int
185  diff_data_atomize_text_lines(struct diff_data *d)
186  {
187  	if (d->data == NULL)
188  		return diff_data_atomize_text_lines_fd(d);
189  	else
190  		return diff_data_atomize_text_lines_mmap(d);
191  }
192  
/*
 * Public entry point: split the data of d into one atom per line.
 *
 * func_data is accepted as part of the signature but is not used here.
 * Returns the result of diff_data_atomize_text_lines().
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	(void)func_data;	/* intentionally unused */

	return diff_data_atomize_text_lines(d);
}
198