xref: /linux/fs/ceph/metric.c (revision ac2dc6d57425ffa9629941d7c9d7c0e51082cb5a)
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/ceph/ceph_debug.h>

#include <linux/types.h>
#include <linux/percpu_counter.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>

#include <linux/ceph/decode.h>

#include "metric.h"
#include "mds_client.h"
13 
14 static bool metrics_disable_warned;
15 
ceph_subvolume_entry_payload_len(void)16 static inline u32 ceph_subvolume_entry_payload_len(void)
17 {
18 	return sizeof(struct ceph_subvolume_metric_entry_wire);
19 }
20 
ceph_subvolume_entry_encoded_len(void)21 static inline u32 ceph_subvolume_entry_encoded_len(void)
22 {
23 	return CEPH_ENCODING_START_BLK_LEN +
24 		ceph_subvolume_entry_payload_len();
25 }
26 
/*
 * Payload bytes inside the outer ENCODE_START block: the entry count
 * followed by every encoded entry.
 */
static inline u32 ceph_subvolume_outer_payload_len(u32 nr_subvols)
{
	/* count is encoded as le64 (size_t on wire) to match FUSE client */
	return sizeof(__le64) +
		nr_subvols * ceph_subvolume_entry_encoded_len();
}
33 
/* Total encoded size of the subvolume metric section for nr_subvols entries. */
static inline u32 ceph_subvolume_metric_data_len(u32 nr_subvols)
{
	return CEPH_ENCODING_START_BLK_LEN +
		ceph_subvolume_outer_payload_len(nr_subvols);
}
39 
ceph_subvolume_clamp_u32(u64 val)40 static inline u32 ceph_subvolume_clamp_u32(u64 val)
41 {
42 	return val > U32_MAX ? U32_MAX : (u32)val;
43 }
44 
ceph_init_subvolume_wire_entry(struct ceph_subvolume_metric_entry_wire * dst,const struct ceph_subvol_metric_snapshot * src)45 static void ceph_init_subvolume_wire_entry(
46 	struct ceph_subvolume_metric_entry_wire *dst,
47 	const struct ceph_subvol_metric_snapshot *src)
48 {
49 	dst->subvolume_id = cpu_to_le64(src->subvolume_id);
50 	dst->read_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->read_ops));
51 	dst->write_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->write_ops));
52 	dst->read_bytes = cpu_to_le64(src->read_bytes);
53 	dst->write_bytes = cpu_to_le64(src->write_bytes);
54 	dst->read_latency_us = cpu_to_le64(src->read_latency_us);
55 	dst->write_latency_us = cpu_to_le64(src->write_latency_us);
56 	dst->time_stamp = 0;
57 }
58 
/*
 * Encode the subvolume metric array into [*p, end): an outer ENCODE_START
 * block (v1, compat 1) holding a le64 entry count, followed by one
 * ENCODE_START-wrapped wire entry per subvolume.
 *
 * On success *p is advanced past the encoded data and 0 is returned;
 * -ERANGE is returned if the buffer is too small.
 */
static int ceph_encode_subvolume_metrics(void **p, void *end,
					 struct ceph_subvol_metric_snapshot *subvols,
					 u32 nr_subvols)
{
	u32 i;

	ceph_start_encoding(p, 1, 1,
			    ceph_subvolume_outer_payload_len(nr_subvols));
	/* count is encoded as le64 (size_t on wire) to match FUSE client */
	ceph_encode_64_safe(p, end, (u64)nr_subvols, enc_err);

	for (i = 0; i < nr_subvols; i++) {
		struct ceph_subvolume_metric_entry_wire wire_entry;

		ceph_init_subvolume_wire_entry(&wire_entry, &subvols[i]);
		ceph_start_encoding(p, 1, 1,
				    ceph_subvolume_entry_payload_len());
		ceph_encode_copy_safe(p, end, &wire_entry,
				      sizeof(wire_entry), enc_err);
	}

	return 0;
enc_err:
	return -ERANGE;
}
84 
/* Encode a ktime_t value as a struct ceph_timespec for the wire. */
static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val)
{
	struct timespec64 t = ktime_to_timespec64(val);

	ceph_encode_timespec64(ts, &t);
}
90 
ceph_mdsc_send_metrics(struct ceph_mds_client * mdsc,struct ceph_mds_session * s)91 static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
92 				   struct ceph_mds_session *s)
93 {
94 	struct ceph_metric_head *head;
95 	struct ceph_metric_cap *cap;
96 	struct ceph_metric_read_latency *read;
97 	struct ceph_metric_write_latency *write;
98 	struct ceph_metric_metadata_latency *meta;
99 	struct ceph_metric_dlease *dlease;
100 	struct ceph_opened_files *files;
101 	struct ceph_pinned_icaps *icaps;
102 	struct ceph_opened_inodes *inodes;
103 	struct ceph_read_io_size *rsize;
104 	struct ceph_write_io_size *wsize;
105 	struct ceph_client_metric *m = &mdsc->metric;
106 	struct ceph_subvol_metric_snapshot *subvols = NULL;
107 	u64 nr_caps = atomic64_read(&m->total_caps);
108 	u32 header_len = sizeof(struct ceph_metric_header);
109 	struct ceph_client *cl = mdsc->fsc->client;
110 	struct ceph_msg *msg;
111 	u32 nr_subvols = 0;
112 	size_t subvol_len = 0;
113 	void *cursor;
114 	s64 sum;
115 	s32 items = 0;
116 	s32 len;
117 
118 	/* Do not send the metrics until the MDS rank is ready */
119 	mutex_lock(&mdsc->mutex);
120 	if (ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) != CEPH_MDS_STATE_ACTIVE) {
121 		mutex_unlock(&mdsc->mutex);
122 		return false;
123 	}
124 	mutex_unlock(&mdsc->mutex);
125 
126 	if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) &&
127 	    test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, &s->s_features)) {
128 		int ret;
129 
130 		ret = ceph_subvolume_metrics_snapshot(&mdsc->subvol_metrics,
131 						      &subvols, &nr_subvols,
132 						      true);
133 		if (ret) {
134 			pr_warn_client(cl, "failed to snapshot subvolume metrics: %d\n",
135 				       ret);
136 			/*
137 			 * On error, ceph_subvolume_metrics_snapshot() guarantees
138 			 * *out = NULL and *nr = 0 at function entry, so subvols
139 			 * is already NULL here - no cleanup needed.
140 			 */
141 			nr_subvols = 0;
142 			subvols = NULL;
143 		}
144 	}
145 
146 	if (nr_subvols) {
147 		/* type (le32) + ENCODE_START payload - no metric header */
148 		subvol_len = sizeof(__le32) +
149 			     ceph_subvolume_metric_data_len(nr_subvols);
150 	}
151 
152 	len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
153 	      + sizeof(*meta) + sizeof(*dlease) + sizeof(*files)
154 	      + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize)
155 	      + sizeof(*wsize) + subvol_len;
156 
157 	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
158 	if (!msg) {
159 		pr_err_client(cl, "to mds%d, failed to allocate message\n",
160 			      s->s_mds);
161 		kfree(subvols);
162 		return false;
163 	}
164 
165 	head = msg->front.iov_base;
166 
167 	/* encode the cap metric */
168 	cap = (struct ceph_metric_cap *)(head + 1);
169 	cap->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
170 	cap->header.ver = 1;
171 	cap->header.compat = 1;
172 	cap->header.data_len = cpu_to_le32(sizeof(*cap) - header_len);
173 	cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit));
174 	cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis));
175 	cap->total = cpu_to_le64(nr_caps);
176 	items++;
177 
178 	/* encode the read latency metric */
179 	read = (struct ceph_metric_read_latency *)(cap + 1);
180 	read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
181 	read->header.ver = 2;
182 	read->header.compat = 1;
183 	read->header.data_len = cpu_to_le32(sizeof(*read) - header_len);
184 	sum = m->metric[METRIC_READ].latency_sum;
185 	ktime_to_ceph_timespec(&read->lat, sum);
186 	ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg);
187 	read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum);
188 	read->count = cpu_to_le64(m->metric[METRIC_READ].total);
189 	items++;
190 
191 	/* encode the write latency metric */
192 	write = (struct ceph_metric_write_latency *)(read + 1);
193 	write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
194 	write->header.ver = 2;
195 	write->header.compat = 1;
196 	write->header.data_len = cpu_to_le32(sizeof(*write) - header_len);
197 	sum = m->metric[METRIC_WRITE].latency_sum;
198 	ktime_to_ceph_timespec(&write->lat, sum);
199 	ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg);
200 	write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum);
201 	write->count = cpu_to_le64(m->metric[METRIC_WRITE].total);
202 	items++;
203 
204 	/* encode the metadata latency metric */
205 	meta = (struct ceph_metric_metadata_latency *)(write + 1);
206 	meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
207 	meta->header.ver = 2;
208 	meta->header.compat = 1;
209 	meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len);
210 	sum = m->metric[METRIC_METADATA].latency_sum;
211 	ktime_to_ceph_timespec(&meta->lat, sum);
212 	ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg);
213 	meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum);
214 	meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total);
215 	items++;
216 
217 	/* encode the dentry lease metric */
218 	dlease = (struct ceph_metric_dlease *)(meta + 1);
219 	dlease->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
220 	dlease->header.ver = 1;
221 	dlease->header.compat = 1;
222 	dlease->header.data_len = cpu_to_le32(sizeof(*dlease) - header_len);
223 	dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit));
224 	dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis));
225 	dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries));
226 	items++;
227 
228 	sum = percpu_counter_sum(&m->total_inodes);
229 
230 	/* encode the opened files metric */
231 	files = (struct ceph_opened_files *)(dlease + 1);
232 	files->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES);
233 	files->header.ver = 1;
234 	files->header.compat = 1;
235 	files->header.data_len = cpu_to_le32(sizeof(*files) - header_len);
236 	files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files));
237 	files->total = cpu_to_le64(sum);
238 	items++;
239 
240 	/* encode the pinned icaps metric */
241 	icaps = (struct ceph_pinned_icaps *)(files + 1);
242 	icaps->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS);
243 	icaps->header.ver = 1;
244 	icaps->header.compat = 1;
245 	icaps->header.data_len = cpu_to_le32(sizeof(*icaps) - header_len);
246 	icaps->pinned_icaps = cpu_to_le64(nr_caps);
247 	icaps->total = cpu_to_le64(sum);
248 	items++;
249 
250 	/* encode the opened inodes metric */
251 	inodes = (struct ceph_opened_inodes *)(icaps + 1);
252 	inodes->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES);
253 	inodes->header.ver = 1;
254 	inodes->header.compat = 1;
255 	inodes->header.data_len = cpu_to_le32(sizeof(*inodes) - header_len);
256 	inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes));
257 	inodes->total = cpu_to_le64(sum);
258 	items++;
259 
260 	/* encode the read io size metric */
261 	rsize = (struct ceph_read_io_size *)(inodes + 1);
262 	rsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_IO_SIZES);
263 	rsize->header.ver = 1;
264 	rsize->header.compat = 1;
265 	rsize->header.data_len = cpu_to_le32(sizeof(*rsize) - header_len);
266 	rsize->total_ops = cpu_to_le64(m->metric[METRIC_READ].total);
267 	rsize->total_size = cpu_to_le64(m->metric[METRIC_READ].size_sum);
268 	items++;
269 
270 	/* encode the write io size metric */
271 	wsize = (struct ceph_write_io_size *)(rsize + 1);
272 	wsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_IO_SIZES);
273 	wsize->header.ver = 1;
274 	wsize->header.compat = 1;
275 	wsize->header.data_len = cpu_to_le32(sizeof(*wsize) - header_len);
276 	wsize->total_ops = cpu_to_le64(m->metric[METRIC_WRITE].total);
277 	wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum);
278 	items++;
279 
280 	cursor = wsize + 1;
281 
282 	if (nr_subvols) {
283 		void *payload;
284 		void *payload_end;
285 		int ret;
286 
287 		/* Emit only the type (le32), no ver/compat/data_len */
288 		ceph_encode_32(&cursor, CLIENT_METRIC_TYPE_SUBVOLUME_METRICS);
289 		items++;
290 
291 		payload = cursor;
292 		payload_end = (char *)payload +
293 			      ceph_subvolume_metric_data_len(nr_subvols);
294 
295 		ret = ceph_encode_subvolume_metrics(&payload, payload_end,
296 						    subvols, nr_subvols);
297 		if (ret) {
298 			pr_warn_client(cl,
299 				       "failed to encode subvolume metrics\n");
300 			kfree(subvols);
301 			ceph_msg_put(msg);
302 			return false;
303 		}
304 
305 		WARN_ON(payload != payload_end);
306 		cursor = payload;
307 	}
308 
309 	put_unaligned_le32(items, &head->num);
310 	msg->front.iov_len = (char *)cursor - (char *)head;
311 	msg->hdr.version = cpu_to_le16(1);
312 	msg->hdr.compat_version = cpu_to_le16(1);
313 	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
314 
315 	ceph_con_send(&s->s_con, msg);
316 
317 	if (nr_subvols) {
318 		mutex_lock(&mdsc->subvol_metrics_last_mutex);
319 		kfree(mdsc->subvol_metrics_last);
320 		mdsc->subvol_metrics_last = subvols;
321 		mdsc->subvol_metrics_last_nr = nr_subvols;
322 		mdsc->subvol_metrics_sent += nr_subvols;
323 		mdsc->subvol_metrics_nonzero_sends++;
324 		mutex_unlock(&mdsc->subvol_metrics_last_mutex);
325 
326 		subvols = NULL;
327 	}
328 	kfree(subvols);
329 
330 	return true;
331 }
332 
333 
metric_get_session(struct ceph_mds_client * mdsc)334 static void metric_get_session(struct ceph_mds_client *mdsc)
335 {
336 	struct ceph_mds_session *s;
337 	int i;
338 
339 	mutex_lock(&mdsc->mutex);
340 	for (i = 0; i < mdsc->max_sessions; i++) {
341 		s = __ceph_lookup_mds_session(mdsc, i);
342 		if (!s)
343 			continue;
344 
345 		/*
346 		 * Skip it if MDS doesn't support the metric collection,
347 		 * or the MDS will close the session's socket connection
348 		 * directly when it get this message.
349 		 *
350 		 * Also skip sessions that don't support SUBVOLUME_METRICS
351 		 * when subvolume metrics collection is enabled. This ensures
352 		 * we only send subvolume metrics to MDSs that understand them.
353 		 * If no session supports the feature, metrics won't be sent.
354 		 */
355 		if (check_session_state(s) &&
356 		    test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) {
357 			if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) &&
358 			    !test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS,
359 				      &s->s_features)) {
360 				ceph_put_mds_session(s);
361 				continue;
362 			}
363 			mdsc->metric.session = s;
364 			break;
365 		}
366 
367 		ceph_put_mds_session(s);
368 	}
369 	mutex_unlock(&mdsc->mutex);
370 }
371 
metric_delayed_work(struct work_struct * work)372 static void metric_delayed_work(struct work_struct *work)
373 {
374 	struct ceph_client_metric *m =
375 		container_of(work, struct ceph_client_metric, delayed_work.work);
376 	struct ceph_mds_client *mdsc =
377 		container_of(m, struct ceph_mds_client, metric);
378 
379 	if (mdsc->stopping)
380 		return;
381 
382 	if (disable_send_metrics) {
383 		if (!metrics_disable_warned) {
384 			pr_info("ceph: metrics sending disabled via module parameter\n");
385 			metrics_disable_warned = true;
386 		}
387 		return;
388 	}
389 	metrics_disable_warned = false;
390 
391 	if (!m->session || !check_session_state(m->session)) {
392 		if (m->session) {
393 			ceph_put_mds_session(m->session);
394 			m->session = NULL;
395 		}
396 		metric_get_session(mdsc);
397 	}
398 
399 	if (m->session)
400 		ceph_mdsc_send_metrics(mdsc, m->session);
401 	else
402 		pr_warn_ratelimited("ceph: metrics worker has no MDS session\n");
403 
404 	metric_schedule_delayed(m);
405 }
406 
ceph_metric_init(struct ceph_client_metric * m)407 int ceph_metric_init(struct ceph_client_metric *m)
408 {
409 	struct ceph_metric *metric;
410 	int ret, i;
411 
412 	if (!m)
413 		return -EINVAL;
414 
415 	atomic64_set(&m->total_dentries, 0);
416 	ret = percpu_counter_init(&m->d_lease_hit, 0, GFP_KERNEL);
417 	if (ret)
418 		return ret;
419 
420 	ret = percpu_counter_init(&m->d_lease_mis, 0, GFP_KERNEL);
421 	if (ret)
422 		goto err_d_lease_mis;
423 
424 	atomic64_set(&m->total_caps, 0);
425 	ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL);
426 	if (ret)
427 		goto err_i_caps_hit;
428 
429 	ret = percpu_counter_init(&m->i_caps_mis, 0, GFP_KERNEL);
430 	if (ret)
431 		goto err_i_caps_mis;
432 
433 	for (i = 0; i < METRIC_MAX; i++) {
434 		metric = &m->metric[i];
435 		spin_lock_init(&metric->lock);
436 		metric->size_sum = 0;
437 		metric->size_min = U64_MAX;
438 		metric->size_max = 0;
439 		metric->total = 0;
440 		metric->latency_sum = 0;
441 		metric->latency_avg = 0;
442 		metric->latency_sq_sum = 0;
443 		metric->latency_min = KTIME_MAX;
444 		metric->latency_max = 0;
445 	}
446 
447 	atomic64_set(&m->opened_files, 0);
448 	ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL);
449 	if (ret)
450 		goto err_opened_inodes;
451 	ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL);
452 	if (ret)
453 		goto err_total_inodes;
454 
455 	m->session = NULL;
456 	INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
457 
458 	return 0;
459 
460 err_total_inodes:
461 	percpu_counter_destroy(&m->opened_inodes);
462 err_opened_inodes:
463 	percpu_counter_destroy(&m->i_caps_mis);
464 err_i_caps_mis:
465 	percpu_counter_destroy(&m->i_caps_hit);
466 err_i_caps_hit:
467 	percpu_counter_destroy(&m->d_lease_mis);
468 err_d_lease_mis:
469 	percpu_counter_destroy(&m->d_lease_hit);
470 
471 	return ret;
472 }
473 
ceph_metric_destroy(struct ceph_client_metric * m)474 void ceph_metric_destroy(struct ceph_client_metric *m)
475 {
476 	if (!m)
477 		return;
478 
479 	cancel_delayed_work_sync(&m->delayed_work);
480 
481 	percpu_counter_destroy(&m->total_inodes);
482 	percpu_counter_destroy(&m->opened_inodes);
483 	percpu_counter_destroy(&m->i_caps_mis);
484 	percpu_counter_destroy(&m->i_caps_hit);
485 	percpu_counter_destroy(&m->d_lease_mis);
486 	percpu_counter_destroy(&m->d_lease_hit);
487 
488 	ceph_put_mds_session(m->session);
489 }
490 
/*
 * Track running min/max.  Wrapped in do { } while (0) so the macro acts
 * as a single statement inside unbraced if/else.  Arguments may be
 * evaluated more than once, so callers must not pass expressions with
 * side effects.
 */
#define METRIC_UPDATE_MIN_MAX(min, max, new)	\
do {						\
	if (unlikely((new) < (min)))		\
		(min) = (new);			\
	if (unlikely((new) > (max)))		\
		(max) = (new);			\
} while (0)
498 
/*
 * Welford-style online update of the running latency mean (*lavg) and the
 * running sum of squared deviations (*sq_sump) with a new sample @lat.
 *
 * @total must already include the new sample (i.e. caller increments the
 * count first); the first sample simply seeds the mean.
 */
static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg,
					   ktime_t *sq_sump, ktime_t lat)
{
	ktime_t avg;

	if (unlikely(total == 1)) {
		*lavg = lat;
	} else {
		/* the sq is (lat - old_avg) * (lat - new_avg) */
		avg = *lavg + div64_s64(lat - *lavg, total);
		*sq_sump += (lat - *lavg) * (lat - avg);
		*lavg = avg;
	}
}
513 
/*
 * Fold one completed request into a ceph_metric: totals, size and latency
 * sums, min/max and the running mean/variance.
 *
 * @r_start/@r_end: request start/end timestamps; latency = r_end - r_start
 * @size: bytes transferred by the request
 * @rc: request result; failed requests are ignored except -ENOENT and
 *      -ETIMEDOUT, which still count as completed operations
 */
void ceph_update_metrics(struct ceph_metric *m,
			 ktime_t r_start, ktime_t r_end,
			 unsigned int size, int rc)
{
	ktime_t lat = ktime_sub(r_end, r_start);
	ktime_t total;

	if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT))
		return;

	spin_lock(&m->lock);
	total = ++m->total;
	m->size_sum += size;
	METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size);
	m->latency_sum += lat;
	METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat);
	__update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum,
				lat);
	spin_unlock(&m->lock);
}
534