/*
 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: iser_memory.c 6964 2006-05-07 11:11:43Z ogerlitz $
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/io.h>
#include <asm/scatterlist.h>
#include <linux/scatterlist.h>

#include "iscsi_iser.h"

#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */

/**
 * iser_regd_buff_release - decrements the reference count of a
 * registered buffer and releases it when the count reaches zero
 *
 * returns 0 if released, 1 if deferred
 */
int iser_regd_buff_release(struct iser_regd_buf *regd_buf)
{
	struct device *dma_device;

	if ((atomic_read(&regd_buf->ref_count) == 0) ||
	    atomic_dec_and_test(&regd_buf->ref_count)) {
		/* if we used the dma mr, unreg is just NOP */
		if (regd_buf->reg.rkey != 0)
			iser_unreg_mem(&regd_buf->reg);

		if (regd_buf->dma_addr) {
			dma_device = regd_buf->device->ib_device->dma_device;
			dma_unmap_single(dma_device,
					 regd_buf->dma_addr,
					 regd_buf->data_size,
					 regd_buf->direction);
		}
		/* else this regd buf is associated with a task which we */
		/* dma_unmap_single/sg later                             */
		return 0;
	} else {
		iser_dbg("Release deferred, regd.buff: 0x%p\n", regd_buf);
		return 1;
	}
}

/**
 * iser_reg_single - fills registered buffer descriptor with
 * registration information
 */
void iser_reg_single(struct iser_device *device,
		     struct iser_regd_buf *regd_buf,
		     enum dma_data_direction direction)
{
	dma_addr_t dma_addr;

	dma_addr = dma_map_single(device->ib_device->dma_device,
				  regd_buf->virt_addr,
				  regd_buf->data_size, direction);
	BUG_ON(dma_mapping_error(dma_addr));

	regd_buf->reg.lkey = device->mr->lkey;
	regd_buf->reg.rkey = 0; /* indicate there's no need to unreg */
	regd_buf->reg.len  = regd_buf->data_size;
	regd_buf->reg.va   = dma_addr;

	regd_buf->dma_addr  = dma_addr;
	regd_buf->direction = direction;
}
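/*
 * Unaligned SG handling (overview): when a task's scatterlist does not
 * meet the RDMA page-alignment rules checked by iser_data_buf_aligned_len()
 * below, the payload is bounced through a single contiguous copy buffer.
 * iser_start_rdma_unaligned_sg() allocates that buffer (copying the data
 * into it first for write commands) and DMA-maps it, while
 * iser_finalize_rdma_unaligned_sg() copies RDMA read data back to the
 * original scatterlist and frees the buffer.  Worked example (assuming
 * 4K pages): a 200K command exceeds ISER_KMALLOC_THRESHOLD, gets rounded
 * up to 256K and is allocated with __get_free_pages() as an order-6 block.
 */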
/**
 * iser_start_rdma_unaligned_sg - allocates a contiguous copy buffer for an
 * unaligned scatterlist, copies the data into it for write commands and
 * DMA-maps it for the RDMA operation
 */
int iser_start_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask,
				 enum iser_data_dir cmd_dir)
{
	int dma_nents;
	struct device *dma_device;
	char *mem = NULL;
	struct iser_data_buf *data = &iser_ctask->data[cmd_dir];
	unsigned long cmd_data_len = data->data_len;

	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
		mem = (void *)__get_free_pages(GFP_NOIO,
		      long_log2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
	else
		mem = kmalloc(cmd_data_len, GFP_NOIO);

	if (mem == NULL) {
		iser_err("Failed to allocate mem size %d %d for copying sglist\n",
			 data->size, (int)cmd_data_len);
		return -ENOMEM;
	}

	if (cmd_dir == ISER_DIR_OUT) {
		/* copy the unaligned sg into the buffer which is used for RDMA */
		struct scatterlist *sg = (struct scatterlist *)data->buf;
		int i;
		char *p, *from;

		for (p = mem, i = 0; i < data->size; i++) {
			from = kmap_atomic(sg[i].page, KM_USER0);
			memcpy(p,
			       from + sg[i].offset,
			       sg[i].length);
			kunmap_atomic(from, KM_USER0);
			p += sg[i].length;
		}
	}

	sg_init_one(&iser_ctask->data_copy[cmd_dir].sg_single, mem, cmd_data_len);
	iser_ctask->data_copy[cmd_dir].buf  =
		&iser_ctask->data_copy[cmd_dir].sg_single;
	iser_ctask->data_copy[cmd_dir].size = 1;

	iser_ctask->data_copy[cmd_dir].copy_buf = mem;

	dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device;

	if (cmd_dir == ISER_DIR_OUT)
		dma_nents = dma_map_sg(dma_device,
				       &iser_ctask->data_copy[cmd_dir].sg_single,
				       1, DMA_TO_DEVICE);
	else
		dma_nents = dma_map_sg(dma_device,
				       &iser_ctask->data_copy[cmd_dir].sg_single,
				       1, DMA_FROM_DEVICE);

	BUG_ON(dma_nents == 0);

	iser_ctask->data_copy[cmd_dir].dma_nents = dma_nents;
	return 0;
}

/**
 * iser_finalize_rdma_unaligned_sg - DMA-unmaps the copy buffer, copies RDMA
 * read data back to the original unaligned scatterlist and frees the buffer
 */
void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask,
				     enum iser_data_dir cmd_dir)
{
	struct device *dma_device;
	struct iser_data_buf *mem_copy;
	unsigned long cmd_data_len;

	dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device;
	mem_copy   = &iser_ctask->data_copy[cmd_dir];

	if (cmd_dir == ISER_DIR_OUT)
		dma_unmap_sg(dma_device, &mem_copy->sg_single, 1,
			     DMA_TO_DEVICE);
	else
		dma_unmap_sg(dma_device, &mem_copy->sg_single, 1,
			     DMA_FROM_DEVICE);

	if (cmd_dir == ISER_DIR_IN) {
		char *mem;
		struct scatterlist *sg;
		unsigned char *p, *to;
		unsigned int sg_size;
		int i;

		/* copy back read RDMA to unaligned sg */
		mem = mem_copy->copy_buf;

		sg      = (struct scatterlist *)iser_ctask->data[ISER_DIR_IN].buf;
		sg_size = iser_ctask->data[ISER_DIR_IN].size;

		for (p = mem, i = 0; i < sg_size; i++) {
			to = kmap_atomic(sg[i].page, KM_SOFTIRQ0);
			memcpy(to + sg[i].offset,
			       p,
			       sg[i].length);
			kunmap_atomic(to, KM_SOFTIRQ0);
			p += sg[i].length;
		}
	}

	cmd_data_len = iser_ctask->data[cmd_dir].data_len;

	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
		free_pages((unsigned long)mem_copy->copy_buf,
			   long_log2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
	else
		kfree(mem_copy->copy_buf);

	mem_copy->copy_buf = NULL;
}
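/*
 * Page vector example (illustrative, assuming 4K pages): a DMA-mapped SG of
 * three contiguous entries { 0x10200/0xE00, 0x11000/0x2000, 0x13000/0x800 }
 * (address/length) passes the alignment check, and iser_sg_to_page_vec()
 * below turns it into the page list { 0x10000, 0x11000, 0x12000, 0x13000 }
 * with offset 0x200 (taken from sg[0].offset) and data_size 0x3600.
 */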
/**
 * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
 * and returns the length of the resulting physical address array (may be
 * less than the original due to possible compaction).
 *
 * We build the "page vec" under the assumption that the SG meets the RDMA
 * alignment requirements. Other than the first and last SG elements, all
 * the "internal" elements can be compacted into a list whose elements are
 * dma addresses of physical pages. The code also supports the weird case
 * where a few fragments of the same page are present in the SG as
 * consecutive elements, and it handles a one entry SG as well.
 */
static int iser_sg_to_page_vec(struct iser_data_buf *data,
			       struct iser_page_vec *page_vec)
{
	struct scatterlist *sg = (struct scatterlist *)data->buf;
	dma_addr_t first_addr, last_addr, page;
	int start_aligned, end_aligned;
	unsigned int cur_page = 0;
	unsigned long total_sz = 0;
	int i;

	/* compute the offset of first element */
	page_vec->offset = (u64) sg[0].offset;

	for (i = 0; i < data->dma_nents; i++) {
		total_sz += sg_dma_len(&sg[i]);

		first_addr = sg_dma_address(&sg[i]);
		last_addr  = first_addr + sg_dma_len(&sg[i]);

		start_aligned = !(first_addr & ~PAGE_MASK);
		end_aligned   = !(last_addr  & ~PAGE_MASK);

		/* continue to collect page fragments till aligned or SG ends */
		while (!end_aligned && (i + 1 < data->dma_nents)) {
			i++;
			total_sz += sg_dma_len(&sg[i]);
			last_addr = sg_dma_address(&sg[i]) + sg_dma_len(&sg[i]);
			end_aligned = !(last_addr & ~PAGE_MASK);
		}

		first_addr = first_addr & PAGE_MASK;

		for (page = first_addr; page < last_addr; page += PAGE_SIZE)
			page_vec->pages[cur_page++] = page;

	}
	page_vec->data_size = total_sz;
	iser_dbg("page_vec->data_size:%d cur_page %d\n",
		 page_vec->data_size, cur_page);
	return cur_page;
}

#define MASK_4K			((1UL << 12) - 1) /* 0xFFF */
#define IS_4K_ALIGNED(addr)	((((unsigned long)addr) & MASK_4K) == 0)

/**
 * iser_data_buf_aligned_len - Tries to determine the maximal correctly
 * aligned for RDMA sub-list of a scatter-gather list of memory buffers,
 * and returns the number of entries which are aligned correctly. Supports
 * the case where consecutive SG elements are actually fragments of the
 * same physical page.
 */
static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data)
{
	struct scatterlist *sg;
	dma_addr_t end_addr, next_addr;
	int i, cnt;
	unsigned int ret_len = 0;

	sg = (struct scatterlist *)data->buf;

	for (cnt = 0, i = 0; i < data->dma_nents; i++, cnt++) {
		/* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX "
		   "offset: %ld sz: %ld\n", i,
		   (unsigned long)page_to_phys(sg[i].page),
		   (unsigned long)sg[i].offset,
		   (unsigned long)sg[i].length); */
		end_addr = sg_dma_address(&sg[i]) +
			   sg_dma_len(&sg[i]);
		/* iser_dbg("Checking sg iobuf end address "
		   "0x%08lX\n", end_addr); */
		if (i + 1 < data->dma_nents) {
			next_addr = sg_dma_address(&sg[i+1]);
			/* are i, i+1 fragments of the same page? */
			if (end_addr == next_addr)
				continue;
			else if (!IS_4K_ALIGNED(end_addr)) {
				ret_len = cnt + 1;
				break;
			}
		}
	}
	if (i == data->dma_nents)
		ret_len = cnt;	/* loop ended */
	iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
		 ret_len, data->dma_nents, data);
	return ret_len;
}
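/*
 * Counter-example (illustrative, assuming 4K pages): if the first SG entry
 * ends at 0x10a00 and the next entry does not start exactly there, the check
 * above stops after one aligned entry, so iser_reg_rdma_mem() below falls
 * back to the copy buffer path provided by iser_start_rdma_unaligned_sg().
 */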
static void iser_data_buf_dump(struct iser_data_buf *data)
{
	struct scatterlist *sg = (struct scatterlist *)data->buf;
	int i;

	for (i = 0; i < data->size; i++)
		iser_err("sg[%d] dma_addr:0x%lX page:0x%p "
			 "off:%d sz:%d dma_len:%d\n",
			 i, (unsigned long)sg_dma_address(&sg[i]),
			 sg[i].page, sg[i].offset,
			 sg[i].length, sg_dma_len(&sg[i]));
}

static void iser_dump_page_vec(struct iser_page_vec *page_vec)
{
	int i;

	iser_err("page vec length %d data size %d\n",
		 page_vec->length, page_vec->data_size);
	for (i = 0; i < page_vec->length; i++)
		iser_err("%d %lx\n", i, (unsigned long)page_vec->pages[i]);
}

static void iser_page_vec_build(struct iser_data_buf *data,
				struct iser_page_vec *page_vec)
{
	int page_vec_len = 0;

	page_vec->length = 0;
	page_vec->offset = 0;

	iser_dbg("Translating sg sz: %d\n", data->dma_nents);
	page_vec_len = iser_sg_to_page_vec(data, page_vec);
	iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len);

	page_vec->length = page_vec_len;

	if (page_vec_len * PAGE_SIZE < page_vec->data_size) {
		iser_err("page_vec too short to hold this SG\n");
		iser_data_buf_dump(data);
		iser_dump_page_vec(page_vec);
		BUG();
	}
}

/**
 * iser_reg_rdma_mem - Registers memory intended for RDMA,
 * obtaining rkey and va
 *
 * returns 0 on success, errno code on failure
 */
int iser_reg_rdma_mem(struct iscsi_iser_cmd_task *iser_ctask,
		      enum iser_data_dir cmd_dir)
{
	struct iser_conn     *ib_conn = iser_ctask->iser_conn->ib_conn;
	struct iser_data_buf *mem = &iser_ctask->data[cmd_dir];
	struct iser_regd_buf *regd_buf;
	int aligned_len;
	int err;

	regd_buf = &iser_ctask->rdma_regd[cmd_dir];

	aligned_len = iser_data_buf_aligned_len(mem);
	if (aligned_len != mem->size) {
		iser_err("rdma alignment violation %d/%d aligned\n",
			 aligned_len, mem->size);
		iser_data_buf_dump(mem);
		/* allocate copy buf, if we are writing, copy the */
		/* unaligned scatterlist, dma map the copy        */
		if (iser_start_rdma_unaligned_sg(iser_ctask, cmd_dir) != 0)
			return -ENOMEM;
		mem = &iser_ctask->data_copy[cmd_dir];
	}

	iser_page_vec_build(mem, ib_conn->page_vec);
	err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, &regd_buf->reg);
	if (err)
		return err;

	/* take a reference on this regd buf such that it will not be released *
	 * (eg in send dto completion) before we get the scsi response          */
	atomic_inc(&regd_buf->ref_count);
	return 0;
}
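/*
 * Usage sketch (illustrative - the real callers live elsewhere in this
 * driver): a command preparation path is expected to do roughly
 *
 *	err = iser_reg_rdma_mem(iser_ctask, ISER_DIR_IN);
 *	if (err)
 *		return err;
 *	... post the send carrying the rkey/va from rdma_regd[ISER_DIR_IN] ...
 *
 * and the extra reference taken above is dropped later via
 * iser_regd_buff_release(&iser_ctask->rdma_regd[ISER_DIR_IN]) once the
 * SCSI response has arrived.
 */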