/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation, version 2.
 *
 *   This program is distributed in the hope that it will be useful, but
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 *   NON INFRINGEMENT.  See the GNU General Public License for
 *   more details.
 */

#include <linux/string.h>
#include <linux/smp.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <asm/fixmap.h>
#include <asm/kmap_types.h>
#include <asm/tlbflush.h>
#include <hv/hypervisor.h>
#include <arch/chip.h>


#if !CHIP_HAS_COHERENT_LOCAL_CACHE()

/* Defined in memcpy.S */
extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n);
extern unsigned long __copy_to_user_inatomic_asm(
	void __user *to, const void *from, unsigned long n);
extern unsigned long __copy_from_user_inatomic_asm(
	void *to, const void __user *from, unsigned long n);
extern unsigned long __copy_from_user_zeroing_asm(
	void *to, const void __user *from, unsigned long n);

typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
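
/*
 * All of these share the memcpy_t signature and return an unsigned
 * long: the user-copy variants return the number of bytes left
 * uncopied (the usual kernel copy_{to,from}_user() convention), and
 * memcpy() below casts __memcpy_asm()'s return value back to its
 * "void *" result.
 */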

/* Size above which to consider TLB games for performance */
#define LARGE_COPY_CUTOFF 2048

/* Communicate to the simulator what we are trying to do. */
#define sim_allow_multiple_caching(b) \
	__insn_mtspr(SPR_SIM_CONTROL, \
		     SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS))
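
/*
 * (SPR_SIM_CONTROL writes are interpreted by the Tilera simulator;
 * on real hardware they are expected to be harmless no-ops.)
 */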

/*
 * Copy memory by briefly enabling incoherent cacheline-at-a-time mode.
 *
 * We set up our own source and destination PTEs that we fully control.
 * This is the only way to guarantee that we don't race with another
 * thread that is modifying the PTE; we can't afford to try the
 * copy_{to,from}_user() technique of catching the interrupt, since
 * we must run with interrupts disabled to avoid the risk of some
 * other code seeing the incoherent data in our cache.  (Recall that
 * our cache is indexed by PA, so even if the other code doesn't use
 * our KM_MEMCPY virtual addresses, they'll still hit in cache using
 * the normal VAs that aren't supposed to hit in cache.)
 */
static void memcpy_multicache(void *dest, const void *source,
			      pte_t dst_pte, pte_t src_pte, int len)
{
	int idx;
	unsigned long flags, newsrc, newdst;
	pmd_t *pmdp;
	pte_t *ptep;
	int cpu = get_cpu();	/* also disables preemption: "idx" is per-cpu */

	/*
	 * Disable interrupts so that we don't recurse into memcpy()
	 * in an interrupt handler, nor accidentally reference
	 * the PA of the source from an interrupt routine.  Also
	 * notify the simulator that we're playing games so we don't
	 * generate spurious coherency warnings.
	 */
	local_irq_save(flags);
	sim_allow_multiple_caching(1);

	/*
	 * Set up the new dest mapping.  If the fixmap slot already
	 * holds this PTE (e.g. from a recent copy to the same page)
	 * we can skip both the PTE write and the local TLB flush.
	 */
	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + KM_MEMCPY0;
	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
	ptep = pte_offset_kernel(pmdp, newdst);
	if (pte_val(*ptep) != pte_val(dst_pte)) {
		set_pte(ptep, dst_pte);
		local_flush_tlb_page(NULL, newdst, PAGE_SIZE);
	}

	/* Set up the new source mapping */
	idx += (KM_MEMCPY0 - KM_MEMCPY1);
	src_pte = hv_pte_set_nc(src_pte);
	src_pte = hv_pte_clear_writable(src_pte);  /* be paranoid */
	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
	ptep = pte_offset_kernel(pmdp, newsrc);
	*ptep = src_pte;   /* set_pte() would be confused by this */
	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);

	/* Actually move the data. */
	__memcpy_asm((void *)newdst, (const void *)newsrc, len);

	/*
	 * Remap the source as locally-cached and not OLOC'ed so that
	 * we can inval without also invaling the remote cpu's cache.
	 * This also avoids known errata with inv'ing cacheable oloc data.
	 */
	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
	*ptep = src_pte;   /* set_pte() would be confused by this */
	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);

	/*
	 * Do the actual invalidation, covering the full L2 cache line
	 * at the end since __memcpy_asm() is somewhat aggressive.
	 */
	__inv_buffer((void *)newsrc, len);

	/*
	 * We're done: notify the simulator that all is back to normal,
	 * and re-enable interrupts and pre-emption.
	 */
	sim_allow_multiple_caching(0);
	local_irq_restore(flags);
	put_cpu();
}

/*
 * Identify large copies from remotely-cached memory, and copy them
 * via memcpy_multicache() if they look good, otherwise fall back
 * to the particular kind of copying passed as the memcpy_t function.
 */
static unsigned long fast_copy(void *dest, const void *source, int len,
			       memcpy_t func)
{
	/*
	 * Check if it's big enough to bother with.  We may end up doing a
	 * small copy via TLB manipulation if we're near a page boundary,
	 * but presumably we'll make it up when we hit the second page.
	 */
	while (len >= LARGE_COPY_CUTOFF) {
		int copy_size, bytes_left_on_page;
		pte_t *src_ptep, *dst_ptep;
		pte_t src_pte, dst_pte;
		struct page *src_page, *dst_page;

		/* Is the source page oloc'ed to a remote cpu? */
retry_source:
		src_ptep = virt_to_pte(current->mm, (unsigned long)source);
		if (src_ptep == NULL)
			break;
		src_pte = *src_ptep;
		if (!hv_pte_get_present(src_pte) ||
		    !hv_pte_get_readable(src_pte) ||
		    hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
			break;
		if (get_remote_cache_cpu(src_pte) == smp_processor_id())
			break;
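		/*
		 * Take a reference so the page can't be freed under us,
		 * then re-read the PTE: if it changed in the window,
		 * drop the reference and retry with the new mapping.
		 */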
		src_page = pfn_to_page(hv_pte_get_pfn(src_pte));
		get_page(src_page);
		if (pte_val(src_pte) != pte_val(*src_ptep)) {
			put_page(src_page);
			goto retry_source;
		}
		if (pte_huge(src_pte)) {
			/* Adjust the PTE to correspond to a small page */
			int pfn = hv_pte_get_pfn(src_pte);
			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
				>> PAGE_SHIFT);
			src_pte = pfn_pte(pfn, src_pte);
			src_pte = pte_mksmall(src_pte);
		}
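		/*
		 * (The adjustment above adds the source's offset within
		 * the huge page, in small-page units, to the base PFN,
		 * so the temporary PTE maps exactly the small page we
		 * are copying from.)
		 */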

		/* Is the destination page writable? */
retry_dest:
		dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
		if (dst_ptep == NULL) {
			put_page(src_page);
			break;
		}
		dst_pte = *dst_ptep;
		if (!hv_pte_get_present(dst_pte) ||
		    !hv_pte_get_writable(dst_pte)) {
			put_page(src_page);
			break;
		}
		dst_page = pfn_to_page(hv_pte_get_pfn(dst_pte));
		if (dst_page == src_page) {
			/*
			 * Source and dest are on the same page; this
			 * potentially exposes us to incoherence if any
			 * part of src and dest overlap on a cache line.
			 * Just give up rather than trying to be precise.
			 */
			put_page(src_page);
			break;
		}
		get_page(dst_page);
		if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
			put_page(dst_page);
			goto retry_dest;
		}
		if (pte_huge(dst_pte)) {
			/* Adjust the PTE to correspond to a small page */
			int pfn = hv_pte_get_pfn(dst_pte);
			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
				>> PAGE_SHIFT);
			dst_pte = pfn_pte(pfn, dst_pte);
			dst_pte = pte_mksmall(dst_pte);
		}

		/* All looks good: create a cachable PTE and copy from it */
		copy_size = len;
		bytes_left_on_page =
			PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
		if (copy_size > bytes_left_on_page)
			copy_size = bytes_left_on_page;
		bytes_left_on_page =
			PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
		if (copy_size > bytes_left_on_page)
			copy_size = bytes_left_on_page;
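		/*
		 * (E.g., if the source starts 0x100 bytes before a page
		 * boundary, copy_size is clamped to 0x100 here; the next
		 * loop iteration then handles the following page.)
		 */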
		memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);

		/* Release the pages */
		put_page(dst_page);
		put_page(src_page);

		/* Continue on the next page */
		dest += copy_size;
		source += copy_size;
		len -= copy_size;
	}

	/* Copy any remaining bytes (or everything, if we bailed early). */
	return func(dest, source, len);
}

void *memcpy(void *to, const void *from, __kernel_size_t n)
{
	if (n < LARGE_COPY_CUTOFF)
		return (void *)__memcpy_asm(to, from, n);
	else
		return (void *)fast_copy(to, from, n, __memcpy_asm);
}

unsigned long __copy_to_user_inatomic(void __user *to, const void *from,
				      unsigned long n)
{
	if (n < LARGE_COPY_CUTOFF)
		return __copy_to_user_inatomic_asm(to, from, n);
	else
		return fast_copy(to, from, n, __copy_to_user_inatomic_asm);
}

unsigned long __copy_from_user_inatomic(void *to, const void __user *from,
					unsigned long n)
{
	if (n < LARGE_COPY_CUTOFF)
		return __copy_from_user_inatomic_asm(to, from, n);
	else
		return fast_copy(to, from, n, __copy_from_user_inatomic_asm);
}

unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
				       unsigned long n)
{
	if (n < LARGE_COPY_CUTOFF)
		return __copy_from_user_zeroing_asm(to, from, n);
	else
		return fast_copy(to, from, n, __copy_from_user_zeroing_asm);
}

#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */