/*
 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *	- Redistributions of source code must retain the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer.
 *
 *	- Redistributions in binary form must reproduce the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer in the documentation and/or other materials
 *	  provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: iser_memory.c 6964 2006-05-07 11:11:43Z ogerlitz $
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <asm/io.h>
#include <asm/scatterlist.h>
#include <linux/scatterlist.h>

#include "iscsi_iser.h"

#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */

/**
 * iser_regd_buff_release - Decrements the reference count for the
 * registered buffer & releases it
 *
 * returns 0 if released, 1 if deferred
 */
int iser_regd_buff_release(struct iser_regd_buf *regd_buf)
{
	struct device *dma_device;

	if ((atomic_read(&regd_buf->ref_count) == 0) ||
	    atomic_dec_and_test(&regd_buf->ref_count)) {
		/* if we used the dma mr, unreg is just NOP */
		if (regd_buf->reg.is_fmr)
			iser_unreg_mem(&regd_buf->reg);

		if (regd_buf->dma_addr) {
			dma_device = regd_buf->device->ib_device->dma_device;
			dma_unmap_single(dma_device,
					 regd_buf->dma_addr,
					 regd_buf->data_size,
					 regd_buf->direction);
		}
		/* else this regd buf is associated with task which we */
		/* dma_unmap_single/sg later                           */
		return 0;
	} else {
		iser_dbg("Release deferred, regd.buff: 0x%p\n", regd_buf);
		return 1;
	}
}

/**
 * iser_reg_single - fills registered buffer descriptor with
 * registration information
 */
void iser_reg_single(struct iser_device *device,
		     struct iser_regd_buf *regd_buf,
		     enum dma_data_direction direction)
{
	dma_addr_t dma_addr;

	dma_addr = dma_map_single(device->ib_device->dma_device,
				  regd_buf->virt_addr,
				  regd_buf->data_size, direction);
	BUG_ON(dma_mapping_error(dma_addr));

	regd_buf->reg.lkey = device->mr->lkey;
	regd_buf->reg.len  = regd_buf->data_size;
	regd_buf->reg.va   = dma_addr;
	regd_buf->reg.is_fmr = 0;

	regd_buf->dma_addr  = dma_addr;
	regd_buf->direction = direction;
}

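/*
 * Unaligned-SG (bounce buffer) path: when a task's scatterlist does not
 * meet the RDMA alignment requirements, the data is staged through a single
 * contiguous buffer instead.  iser_start_rdma_unaligned_sg() allocates that
 * buffer (kmalloc up to ISER_KMALLOC_THRESHOLD, get_free_pages above that),
 * copies WRITE data into it and dma-maps it as a one-entry SG;
 * iser_finalize_rdma_unaligned_sg() unmaps it, copies READ data back to the
 * original scatterlist and frees it.
 */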
/**
 * iser_start_rdma_unaligned_sg
 */
int iser_start_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask,
				 enum iser_data_dir cmd_dir)
{
	int dma_nents;
	struct device *dma_device;
	char *mem = NULL;
	struct iser_data_buf *data = &iser_ctask->data[cmd_dir];
	unsigned long cmd_data_len = data->data_len;

	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
		mem = (void *)__get_free_pages(GFP_NOIO,
		      ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
	else
		mem = kmalloc(cmd_data_len, GFP_NOIO);

	if (mem == NULL) {
		iser_err("Failed to allocate mem size %d %d for copying sglist\n",
			 data->size, (int)cmd_data_len);
		return -ENOMEM;
	}

	if (cmd_dir == ISER_DIR_OUT) {
		/* copy the unaligned sg into the buffer which is used for RDMA */
		struct scatterlist *sg = (struct scatterlist *)data->buf;
		int i;
		char *p, *from;

		for (p = mem, i = 0; i < data->size; i++) {
			from = kmap_atomic(sg[i].page, KM_USER0);
			memcpy(p,
			       from + sg[i].offset,
			       sg[i].length);
			kunmap_atomic(from, KM_USER0);
			p += sg[i].length;
		}
	}

	sg_init_one(&iser_ctask->data_copy[cmd_dir].sg_single, mem, cmd_data_len);
	iser_ctask->data_copy[cmd_dir].buf  =
		&iser_ctask->data_copy[cmd_dir].sg_single;
	iser_ctask->data_copy[cmd_dir].size = 1;

	iser_ctask->data_copy[cmd_dir].copy_buf = mem;

	dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device;

	if (cmd_dir == ISER_DIR_OUT)
		dma_nents = dma_map_sg(dma_device,
				       &iser_ctask->data_copy[cmd_dir].sg_single,
				       1, DMA_TO_DEVICE);
	else
		dma_nents = dma_map_sg(dma_device,
				       &iser_ctask->data_copy[cmd_dir].sg_single,
				       1, DMA_FROM_DEVICE);

	BUG_ON(dma_nents == 0);

	iser_ctask->data_copy[cmd_dir].dma_nents = dma_nents;
	return 0;
}

/**
 * iser_finalize_rdma_unaligned_sg
 */
void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask,
				     enum iser_data_dir cmd_dir)
{
	struct device *dma_device;
	struct iser_data_buf *mem_copy;
	unsigned long cmd_data_len;

	dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device;
	mem_copy   = &iser_ctask->data_copy[cmd_dir];

	if (cmd_dir == ISER_DIR_OUT)
		dma_unmap_sg(dma_device, &mem_copy->sg_single, 1,
			     DMA_TO_DEVICE);
	else
		dma_unmap_sg(dma_device, &mem_copy->sg_single, 1,
			     DMA_FROM_DEVICE);

	if (cmd_dir == ISER_DIR_IN) {
		char *mem;
		struct scatterlist *sg;
		unsigned char *p, *to;
		unsigned int sg_size;
		int i;

		/* copy back read RDMA to unaligned sg */
		mem = mem_copy->copy_buf;

		sg      = (struct scatterlist *)iser_ctask->data[ISER_DIR_IN].buf;
		sg_size = iser_ctask->data[ISER_DIR_IN].size;

		for (p = mem, i = 0; i < sg_size; i++) {
			to = kmap_atomic(sg[i].page, KM_SOFTIRQ0);
			memcpy(to + sg[i].offset,
			       p,
			       sg[i].length);
			kunmap_atomic(to, KM_SOFTIRQ0);
			p += sg[i].length;
		}
	}

	cmd_data_len = iser_ctask->data[cmd_dir].data_len;

	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
		free_pages((unsigned long)mem_copy->copy_buf,
			   ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
	else
		kfree(mem_copy->copy_buf);

	mem_copy->copy_buf = NULL;
}

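/*
 * Illustrative note: the page vector built below is the list of 4K page
 * addresses covered by the dma-mapped SG list, plus the byte offset of the
 * first element within its 4K page.  For example (hypothetical addresses),
 * an aligned SG of [0x10200, len 0xE00], [0x11000, len 0x2000] compacts to
 * pages {0x10000, 0x11000, 0x12000} with offset 0x200 and data_size 0x2E00.
 */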
/**
 * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
 * and returns the length of resulting physical address array (may be less than
 * the original due to possible compaction).
 *
 * we build a "page vec" under the assumption that the SG meets the RDMA
 * alignment requirements. Other than the first and last SG elements, all
 * the "internal" elements can be compacted into a list whose elements are
 * dma addresses of physical pages. The code also supports the case where
 * several fragments of the same page appear in the SG as consecutive
 * elements. In addition, it handles a one entry SG.
 */
static int iser_sg_to_page_vec(struct iser_data_buf *data,
			       struct iser_page_vec *page_vec)
{
	struct scatterlist *sg = (struct scatterlist *)data->buf;
	dma_addr_t first_addr, last_addr, page;
	int start_aligned, end_aligned;
	unsigned int cur_page = 0;
	unsigned long total_sz = 0;
	int i;

	/* compute the offset of first element */
	page_vec->offset = (u64) sg[0].offset & ~MASK_4K;

	for (i = 0; i < data->dma_nents; i++) {
		total_sz += sg_dma_len(&sg[i]);

		first_addr = sg_dma_address(&sg[i]);
		last_addr  = first_addr + sg_dma_len(&sg[i]);

		start_aligned = !(first_addr & ~MASK_4K);
		end_aligned   = !(last_addr  & ~MASK_4K);

		/* continue to collect page fragments till aligned or SG ends */
		while (!end_aligned && (i + 1 < data->dma_nents)) {
			i++;
			total_sz += sg_dma_len(&sg[i]);
			last_addr = sg_dma_address(&sg[i]) + sg_dma_len(&sg[i]);
			end_aligned = !(last_addr & ~MASK_4K);
		}

		/* handle the 1st page in the 1st DMA element */
		if (cur_page == 0) {
			page = first_addr & MASK_4K;
			page_vec->pages[cur_page] = page;
			cur_page++;
			page += SIZE_4K;
		} else
			page = first_addr;

		for (; page < last_addr; page += SIZE_4K) {
			page_vec->pages[cur_page] = page;
			cur_page++;
		}

	}
	page_vec->data_size = total_sz;
	iser_dbg("page_vec->data_size:%d cur_page %d\n",
		 page_vec->data_size, cur_page);
	return cur_page;
}

#define IS_4K_ALIGNED(addr)	((((unsigned long)addr) & ~MASK_4K) == 0)

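/*
 * Illustrative note: iser_data_buf_aligned_len() below treats consecutive
 * dma entries as "aligned" when each entry either ends exactly where the
 * next one starts (fragments of the same page / contiguous memory) or ends
 * on a 4K boundary.  For example (hypothetical addresses), the dma segments
 * [0x10000, len 0x1000], [0x11000, len 0x2000], [0x13000, len 0x800] count
 * as 3 aligned entries, while changing the middle length to 0x1800 (so it
 * ends at 0x12800) stops the scan after 2 entries.
 */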
/**
 * iser_data_buf_aligned_len - Tries to determine the maximal sub-list of a
 * scatter-gather list of memory buffers which is correctly aligned for RDMA,
 * and returns the number of entries which are aligned correctly. Supports
 * the case where consecutive SG elements are actually fragments of the same
 * physical page.
 */
static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data)
{
	struct scatterlist *sg;
	dma_addr_t end_addr, next_addr;
	int i, cnt;
	unsigned int ret_len = 0;

	sg = (struct scatterlist *)data->buf;

	for (cnt = 0, i = 0; i < data->dma_nents; i++, cnt++) {
		/* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX "
		   "offset: %ld sz: %ld\n", i,
		   (unsigned long)page_to_phys(sg[i].page),
		   (unsigned long)sg[i].offset,
		   (unsigned long)sg[i].length); */
		end_addr = sg_dma_address(&sg[i]) +
			   sg_dma_len(&sg[i]);
		/* iser_dbg("Checking sg iobuf end address "
		   "0x%08lX\n", end_addr); */
		if (i + 1 < data->dma_nents) {
			next_addr = sg_dma_address(&sg[i+1]);
			/* are i, i+1 fragments of the same page? */
			if (end_addr == next_addr)
				continue;
			else if (!IS_4K_ALIGNED(end_addr)) {
				ret_len = cnt + 1;
				break;
			}
		}
	}
	if (i == data->dma_nents)
		ret_len = cnt;	/* loop ended */
	iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
		 ret_len, data->dma_nents, data);
	return ret_len;
}

static void iser_data_buf_dump(struct iser_data_buf *data)
{
	struct scatterlist *sg = (struct scatterlist *)data->buf;
	int i;

	for (i = 0; i < data->dma_nents; i++)
		iser_err("sg[%d] dma_addr:0x%lX page:0x%p "
			 "off:0x%x sz:0x%x dma_len:0x%x\n",
			 i, (unsigned long)sg_dma_address(&sg[i]),
			 sg[i].page, sg[i].offset,
			 sg[i].length, sg_dma_len(&sg[i]));
}

static void iser_dump_page_vec(struct iser_page_vec *page_vec)
{
	int i;

	iser_err("page vec length %d data size %d\n",
		 page_vec->length, page_vec->data_size);
	for (i = 0; i < page_vec->length; i++)
		iser_err("%d %lx\n", i, (unsigned long)page_vec->pages[i]);
}

static void iser_page_vec_build(struct iser_data_buf *data,
				struct iser_page_vec *page_vec)
{
	int page_vec_len = 0;

	page_vec->length = 0;
	page_vec->offset = 0;

	iser_dbg("Translating sg sz: %d\n", data->dma_nents);
	page_vec_len = iser_sg_to_page_vec(data, page_vec);
	iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len);

	page_vec->length = page_vec_len;

	if (page_vec_len * SIZE_4K < page_vec->data_size) {
		iser_err("page_vec too short to hold this SG\n");
		iser_data_buf_dump(data);
		iser_dump_page_vec(page_vec);
		BUG();
	}
}

int iser_dma_map_task_data(struct iscsi_iser_cmd_task *iser_ctask,
			   struct iser_data_buf *data,
			   enum iser_data_dir iser_dir,
			   enum dma_data_direction dma_dir)
{
	struct device *dma_device;

	iser_ctask->dir[iser_dir] = 1;
	dma_device =
		iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device;

	data->dma_nents = dma_map_sg(dma_device, data->buf, data->size, dma_dir);
	if (data->dma_nents == 0) {
		iser_err("dma_map_sg failed!!!\n");
		return -EINVAL;
	}
	return 0;
}

void iser_dma_unmap_task_data(struct iscsi_iser_cmd_task *iser_ctask)
{
	struct device *dma_device;
	struct iser_data_buf *data;

	dma_device =
		iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device;

	if (iser_ctask->dir[ISER_DIR_IN]) {
		data = &iser_ctask->data[ISER_DIR_IN];
		dma_unmap_sg(dma_device, data->buf, data->size, DMA_FROM_DEVICE);
	}

	if (iser_ctask->dir[ISER_DIR_OUT]) {
		data = &iser_ctask->data[ISER_DIR_OUT];
		dma_unmap_sg(dma_device, data->buf, data->size, DMA_TO_DEVICE);
	}
}

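/*
 * Registration decision taken by iser_reg_rdma_mem() below: if the
 * dma-mapped SG is not fully aligned it is first bounced through a copy
 * buffer (see iser_start_rdma_unaligned_sg above); a single dma entry is
 * then described directly with the device MR's lkey/rkey, while multiple
 * entries are compacted into a page vector and registered through FMR via
 * iser_reg_page_vec().
 */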
/**
 * iser_reg_rdma_mem - Registers memory intended for RDMA,
 * obtaining rkey and va
 *
 * returns 0 on success, errno code on failure
 */
int iser_reg_rdma_mem(struct iscsi_iser_cmd_task *iser_ctask,
		      enum iser_data_dir cmd_dir)
{
	struct iser_conn     *ib_conn = iser_ctask->iser_conn->ib_conn;
	struct iser_device   *device = ib_conn->device;
	struct iser_data_buf *mem = &iser_ctask->data[cmd_dir];
	struct iser_regd_buf *regd_buf;
	int aligned_len;
	int err;
	int i;
	struct scatterlist *sg;

	regd_buf = &iser_ctask->rdma_regd[cmd_dir];

	aligned_len = iser_data_buf_aligned_len(mem);
	if (aligned_len != mem->dma_nents) {
		iser_err("rdma alignment violation %d/%d aligned\n",
			 aligned_len, mem->size);
		iser_data_buf_dump(mem);

		/* unmap the command data before accessing it */
		iser_dma_unmap_task_data(iser_ctask);

		/* allocate copy buf, if we are writing, copy the */
		/* unaligned scatterlist, dma map the copy        */
		if (iser_start_rdma_unaligned_sg(iser_ctask, cmd_dir) != 0)
			return -ENOMEM;
		mem = &iser_ctask->data_copy[cmd_dir];
	}

	/* if there is a single dma entry, FMR is not needed */
	if (mem->dma_nents == 1) {
		sg = (struct scatterlist *)mem->buf;

		regd_buf->reg.lkey = device->mr->lkey;
		regd_buf->reg.rkey = device->mr->rkey;
		regd_buf->reg.len  = sg_dma_len(&sg[0]);
		regd_buf->reg.va   = sg_dma_address(&sg[0]);
		regd_buf->reg.is_fmr = 0;

		iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X "
			 "va: 0x%08lX sz: %ld\n",
			 (unsigned int)regd_buf->reg.lkey,
			 (unsigned int)regd_buf->reg.rkey,
			 (unsigned long)regd_buf->reg.va,
			 (unsigned long)regd_buf->reg.len);
	} else { /* use FMR for multiple dma entries */
		iser_page_vec_build(mem, ib_conn->page_vec);
		err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, &regd_buf->reg);
		if (err) {
			iser_data_buf_dump(mem);
			iser_err("mem->dma_nents = %d (dlength = 0x%x)\n", mem->dma_nents,
				 ntoh24(iser_ctask->desc.iscsi_header.dlength));
			iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
				 ib_conn->page_vec->data_size, ib_conn->page_vec->length,
				 ib_conn->page_vec->offset);
			for (i = 0; i < ib_conn->page_vec->length; i++)
				iser_err("page_vec[%d] = 0x%llx\n", i,
					 (unsigned long long)ib_conn->page_vec->pages[i]);
			return err;
		}
	}

	/* take a reference on this regd buf such that it will not be released *
	 * (eg in send dto completion) before we get the scsi response         */
	atomic_inc(&regd_buf->ref_count);
	return 0;
}