linux/drivers/misc/sgi-gru/grufault.c
/*
 * SN Platform GRU Driver
 *
 *              FAULT HANDLER FOR GRU DETECTED TLB MISSES
 *
 * This file contains code that handles TLB misses within the GRU.
 * These misses are reported either via interrupts or user polling of
 * the user CB.
 *
 *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/device.h>
#include <linux/io.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include "gru.h"
#include "grutables.h"
#include "grulib.h"
#include "gru_instructions.h"
#include <asm/uv/uv_hub.h>

/*
 * Test if a physical address is a valid GRU GSEG address
 */
static inline int is_gru_paddr(unsigned long paddr)
{
        return paddr >= gru_start_paddr && paddr < gru_end_paddr;
}

/*
 * Find the vma of a GRU segment. Caller must hold mmap_sem.
 */
struct vm_area_struct *gru_find_vma(unsigned long vaddr)
{
        struct vm_area_struct *vma;

        vma = find_vma(current->mm, vaddr);
        if (vma && vma->vm_start <= vaddr && vma->vm_ops == &gru_vm_ops)
                return vma;
        return NULL;
}

/*
 * Find and lock the gts that contains the specified user vaddr.
 *
 * Returns:
 *      - *gts with the mmap_sem locked for read and the GTS locked.
 *      - NULL if vaddr is invalid or is not a GSEG vaddr.
 */

static struct gru_thread_state *gru_find_lock_gts(unsigned long vaddr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        struct gru_thread_state *gts = NULL;

        down_read(&mm->mmap_sem);
        vma = gru_find_vma(vaddr);
        if (vma)
                gts = gru_find_thread_state(vma, TSID(vaddr, vma));
        if (gts)
                mutex_lock(&gts->ts_ctxlock);
        else
                up_read(&mm->mmap_sem);
        return gts;
}

static struct gru_thread_state *gru_alloc_locked_gts(unsigned long vaddr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        struct gru_thread_state *gts = NULL;

        down_write(&mm->mmap_sem);
        vma = gru_find_vma(vaddr);
        if (vma)
                gts = gru_alloc_thread_state(vma, TSID(vaddr, vma));
        if (gts) {
                mutex_lock(&gts->ts_ctxlock);
                downgrade_write(&mm->mmap_sem);
        } else {
                up_write(&mm->mmap_sem);
        }

        return gts;
}

/*
 * Unlock a GTS that was previously locked with gru_find_lock_gts().
 */
static void gru_unlock_gts(struct gru_thread_state *gts)
{
        mutex_unlock(&gts->ts_ctxlock);
        up_read(&current->mm->mmap_sem);
}
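
/*
 * Illustrative sketch only (not a new interface): the user-request handlers
 * later in this file pair the lookup and unlock helpers above as follows:
 *
 *      gts = gru_find_lock_gts(vaddr);
 *      if (!gts)
 *              return -EINVAL;
 *      ... operate on the context with mmap_sem held for read ...
 *      gru_unlock_gts(gts);
 */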

/*
 * Set a CB.istatus to active using a user virtual address. This must be done
 * just prior to a TFH RESTART. The new cb.istatus is an in-cache status ONLY.
 * If the line is evicted, the status may be lost. The in-cache update
 * is necessary to prevent the user from seeing a stale cb.istatus that will
 * change as soon as the TFH restart is complete. Races may cause an
 * occasional failure to clear the cb.istatus, but that is ok.
 *
 * If the cb address is not valid (should not happen, but...), nothing
 * bad will happen. The get_user()/put_user() will fail but there
 * are no bad side-effects.
 */
static void gru_cb_set_istatus_active(unsigned long __user *cb)
{
        union {
                struct gru_instruction_bits bits;
                unsigned long dw;
        } u;

        if (cb) {
                get_user(u.dw, cb);
                u.bits.istatus = CBS_ACTIVE;
                put_user(u.dw, cb);
        }
}

/*
 * Convert an interrupt IRQ to a pointer to the GRU that caused the
 * interrupt. Interrupts are always sent to a cpu on the blade that contains the
 * GRU (except for headless blades, which are not currently supported). A blade
 * has N grus; a block of N consecutive IRQs is assigned to the GRUs. The IRQ
 * number uniquely identifies the GRU chiplet on the local blade that caused the
 * interrupt. Always called in interrupt context.
 */
static inline struct gru_state *irq_to_gru(int irq)
{
        return &gru_base[uv_numa_blade_id()]->bs_grus[irq - IRQ_GRU];
}
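
/*
 * For example, given the IRQ_GRU base used above: on the local blade,
 * IRQ_GRU indexes chiplet 0, IRQ_GRU + 1 indexes chiplet 1, and so on,
 * which is why the simple "irq - IRQ_GRU" index into bs_grus suffices.
 */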

/*
 * Read & clear a TFM
 *
 * The GRU has an array of fault maps. A map is private to a cpu.
 * Only one cpu will be accessing a cpu's fault map.
 *
 * This function scans the cpu-private fault map & clears all bits that
 * are set. The function returns a bitmap that indicates the bits that
 * were cleared. Note that since the maps may be updated asynchronously by
 * the GRU, atomic operations must be used to clear bits.
 */
static void get_clear_fault_map(struct gru_state *gru,
                                struct gru_tlb_fault_map *map)
{
        unsigned long i, k;
        struct gru_tlb_fault_map *tfm;

        tfm = get_tfm_for_cpu(gru, gru_cpu_fault_map_id());
        prefetchw(tfm);         /* Helps on hardware, required for emulator */
        for (i = 0; i < BITS_TO_LONGS(GRU_NUM_CBE); i++) {
                k = tfm->fault_bits[i];
                if (k)
                        k = xchg(&tfm->fault_bits[i], 0UL);
                map->fault_bits[i] = k;
        }

        /*
         * Not functionally required but helps performance. (Required
         * on emulator)
         */
        gru_flush_cache(tfm);
}
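
/*
 * Sketch of the consumer side (this is the pattern gru_intr() below uses;
 * shown here only to illustrate how the returned bitmap is walked):
 *
 *      struct gru_tlb_fault_map map;
 *      int cbrnum;
 *
 *      get_clear_fault_map(gru, &map);
 *      for_each_cbr_in_tfm(cbrnum, map.fault_bits)
 *              ... handle the miss reported by CBR "cbrnum" ...
 */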

/*
 * Atomic (interrupt context) & non-atomic (user context) functions to
 * convert a vaddr into a physical address. The size of the page
 * is returned in pageshift.
 *      returns:
 *                0 - successful
 *              < 0 - error code
 *                1 - (atomic only) try again in non-atomic context
 */
static int non_atomic_pte_lookup(struct vm_area_struct *vma,
                                 unsigned long vaddr, int write,
                                 unsigned long *paddr, int *pageshift)
{
        struct page *page;

        /* ZZZ Need to handle HUGE pages */
        if (is_vm_hugetlb_page(vma))
                return -EFAULT;
        *pageshift = PAGE_SHIFT;
        if (get_user_pages
            (current, current->mm, vaddr, 1, write, 0, &page, NULL) <= 0)
                return -EFAULT;
        *paddr = page_to_phys(page);
        put_page(page);
        return 0;
}

/*
 * atomic_pte_lookup
 *
 * Convert a user virtual address to a physical address.
 * Only supports Intel large pages (2MB only) on x86_64.
 *      ZZZ - hugepage support is incomplete
 *
 * NOTE: mmap_sem is already held on entry to this function. This
 * guarantees existence of the page tables.
 */
static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr,
        int write, unsigned long *paddr, int *pageshift)
{
        pgd_t *pgdp;
        pmd_t *pmdp;
        pud_t *pudp;
        pte_t pte;

        pgdp = pgd_offset(vma->vm_mm, vaddr);
        if (unlikely(pgd_none(*pgdp)))
                goto err;

        pudp = pud_offset(pgdp, vaddr);
        if (unlikely(pud_none(*pudp)))
                goto err;

        pmdp = pmd_offset(pudp, vaddr);
        if (unlikely(pmd_none(*pmdp)))
                goto err;
#ifdef CONFIG_X86_64
        if (unlikely(pmd_large(*pmdp)))
                pte = *(pte_t *) pmdp;
        else
#endif
                pte = *pte_offset_kernel(pmdp, vaddr);

        if (unlikely(!pte_present(pte) ||
                     (write && (!pte_write(pte) || !pte_dirty(pte)))))
                return 1;

        *paddr = pte_pfn(pte) << PAGE_SHIFT;
#ifdef CONFIG_HUGETLB_PAGE
        *pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
#else
        *pageshift = PAGE_SHIFT;
#endif
        return 0;

err:
        local_irq_enable();
        return 1;
}
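
/*
 * Note: a return value of 1 from atomic_pte_lookup() means the PTE is not
 * usable right now. gru_try_dropin() below either retries with
 * non_atomic_pte_lookup() (user context, cb != NULL) or switches the CB to
 * user polling mode (interrupt context, cb == NULL).
 */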

/*
 * Drop a TLB entry into the GRU. The fault is described by info in a TFH.
 *      Input:
 *              cb    Address of user CBR. Null if not running in user context
 *      Return:
 *                0 = dropin, exception, or switch to UPM successful
 *                1 = range invalidate active
 *              < 0 = error code
 *
 */
static int gru_try_dropin(struct gru_thread_state *gts,
                          struct gru_tlb_fault_handle *tfh,
                          unsigned long __user *cb)
{
        struct mm_struct *mm = gts->ts_mm;
        struct vm_area_struct *vma;
        int pageshift, asid, write, ret;
        unsigned long paddr, gpa, vaddr;

        /*
         * NOTE: The GRU contains magic hardware that eliminates races between
         * TLB invalidates and TLB dropins. If an invalidate occurs
         * in the window between reading the TFH and the subsequent TLB dropin,
         * the dropin is ignored. This eliminates the need for additional locks.
         */

        /*
         * Error if TFH state is IDLE or FMM mode & the user is issuing a UPM
         * call. Might be a hardware race OR a stupid user. Ignore FMM because
         * FMM is a transient state.
         */
        if (tfh->state == TFHSTATE_IDLE)
                goto failidle;
        if (tfh->state == TFHSTATE_MISS_FMM && cb)
                goto failfmm;

        write = (tfh->cause & TFHCAUSE_TLB_MOD) != 0;
        vaddr = tfh->missvaddr;
        asid = tfh->missasid;
        if (asid == 0)
                goto failnoasid;

        rmb();  /* TFH must be cache resident before reading ms_range_active */

        /*
         * TFH is cache resident - at least briefly. Fail the dropin
         * if a range invalidate is active.
         */
        if (atomic_read(&gts->ts_gms->ms_range_active))
                goto failactive;

        vma = find_vma(mm, vaddr);
        if (!vma)
                goto failinval;

        /*
         * Atomic lookup is faster & usually works even if called in non-atomic
         * context.
         */
        rmb();  /* Must check ms_range_active before loading PTEs */
        ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &pageshift);
        if (ret) {
                if (!cb)
                        goto failupm;
                if (non_atomic_pte_lookup(vma, vaddr, write, &paddr,
                                          &pageshift))
                        goto failinval;
        }
        if (is_gru_paddr(paddr))
                goto failinval;

        paddr = paddr & ~((1UL << pageshift) - 1);
        gpa = uv_soc_phys_ram_to_gpa(paddr);
        gru_cb_set_istatus_active(cb);
        tfh_write_restart(tfh, gpa, GAA_RAM, vaddr, asid, write,
                          GRU_PAGESIZE(pageshift));
        STAT(tlb_dropin);
        gru_dbg(grudev,
                "%s: tfh 0x%p, vaddr 0x%lx, asid 0x%x, ps %d, gpa 0x%lx\n",
                ret ? "non-atomic" : "atomic", tfh, vaddr, asid,
                pageshift, gpa);
        return 0;

failnoasid:
        /* No asid (delayed unload). */
        STAT(tlb_dropin_fail_no_asid);
        gru_dbg(grudev, "FAILED no_asid tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
        if (!cb)
                tfh_user_polling_mode(tfh);
        else
                gru_flush_cache(tfh);
        return -EAGAIN;

failupm:
        /* Atomic failure; switch CBR to UPM */
        tfh_user_polling_mode(tfh);
        STAT(tlb_dropin_fail_upm);
        gru_dbg(grudev, "FAILED upm tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
        return 1;

failfmm:
        /* FMM state on UPM call */
        STAT(tlb_dropin_fail_fmm);
        gru_dbg(grudev, "FAILED fmm tfh: 0x%p, state %d\n", tfh, tfh->state);
        return 0;

failidle:
        /* TFH was idle - no miss pending */
        gru_flush_cache(tfh);
        if (cb)
                gru_flush_cache(cb);
        STAT(tlb_dropin_fail_idle);
        gru_dbg(grudev, "FAILED idle tfh: 0x%p, state %d\n", tfh, tfh->state);
        return 0;

failinval:
        /* All errors (atomic & non-atomic) switch CBR to EXCEPTION state */
        tfh_exception(tfh);
        STAT(tlb_dropin_fail_invalid);
        gru_dbg(grudev, "FAILED inval tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
        return -EFAULT;

failactive:
        /* Range invalidate active. Switch to UPM iff atomic */
        if (!cb)
                tfh_user_polling_mode(tfh);
        else
                gru_flush_cache(tfh);
        STAT(tlb_dropin_fail_range_active);
        gru_dbg(grudev, "FAILED range active: tfh 0x%p, vaddr 0x%lx\n",
                tfh, vaddr);
        return 1;
}
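
/*
 * How the two callers interpret gru_try_dropin() results (a summary of the
 * code below, not additional behavior): gru_intr() calls it once per
 * faulting CBR and ignores the return value, since the failure paths above
 * already switch the CB to UPM or flush the TFH; gru_user_dropin() retries
 * while the return value is > 0, i.e. while a range invalidate is active.
 */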

/*
 * Process an external interrupt from the GRU. This interrupt is
 * caused by a TLB miss.
 * Note that this is the interrupt handler that is registered with the
 * Linux interrupt subsystem.
 */
irqreturn_t gru_intr(int irq, void *dev_id)
{
        struct gru_state *gru;
        struct gru_tlb_fault_map map;
        struct gru_thread_state *gts;
        struct gru_tlb_fault_handle *tfh = NULL;
        int cbrnum, ctxnum;

        STAT(intr);

        gru = irq_to_gru(irq);
        if (!gru) {
                dev_err(grudev, "GRU: invalid interrupt: cpu %d, irq %d\n",
                        raw_smp_processor_id(), irq);
                return IRQ_NONE;
        }
        get_clear_fault_map(gru, &map);
        gru_dbg(grudev, "irq %d, gru %x, map 0x%lx\n", irq, gru->gs_gid,
                map.fault_bits[0]);

        for_each_cbr_in_tfm(cbrnum, map.fault_bits) {
                tfh = get_tfh_by_index(gru, cbrnum);
                prefetchw(tfh); /* Helps on hdw, required for emulator */

                /*
                 * When hardware sets a bit in the faultmap, it implicitly
                 * locks the GRU context so that it cannot be unloaded.
                 * The gts cannot change until a TFH start/writestart command
                 * is issued.
                 */
                ctxnum = tfh->ctxnum;
                gts = gru->gs_gts[ctxnum];

                /*
                 * This is running in interrupt context. Trylock the mmap_sem.
                 * If it fails, retry the fault in user context.
                 */
                if (down_read_trylock(&gts->ts_mm->mmap_sem)) {
                        gru_try_dropin(gts, tfh, NULL);
                        up_read(&gts->ts_mm->mmap_sem);
                } else {
                        tfh_user_polling_mode(tfh);
                }
        }
        return IRQ_HANDLED;
}
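/*
 * Retry a TLB dropin for a user "call OS" request. Waits until no range
 * invalidate is active, then attempts the dropin; repeats while
 * gru_try_dropin() reports that a range invalidate raced with the attempt.
 */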
static int gru_user_dropin(struct gru_thread_state *gts,
                           struct gru_tlb_fault_handle *tfh,
                           unsigned long __user *cb)
{
        struct gru_mm_struct *gms = gts->ts_gms;
        int ret;

        while (1) {
                wait_event(gms->ms_wait_queue,
                           atomic_read(&gms->ms_range_active) == 0);
                prefetchw(tfh); /* Helps on hdw, required for emulator */
                ret = gru_try_dropin(gts, tfh, cb);
                if (ret <= 0)
                        return ret;
                STAT(call_os_wait_queue);
        }
}

/*
 * This interface is called as a result of a user detecting a "call OS" bit
 * in a user CB. Normally this means that a TLB fault has occurred.
 *      cb - user virtual address of the CB
 */
int gru_handle_user_call_os(unsigned long cb)
{
        struct gru_tlb_fault_handle *tfh;
        struct gru_thread_state *gts;
        unsigned long __user *cbp;
        int ucbnum, cbrnum, ret = -EINVAL;

        STAT(call_os);
        gru_dbg(grudev, "address 0x%lx\n", cb);

        /* sanity check the cb pointer */
        ucbnum = get_cb_number((void *)cb);
        if ((cb & (GRU_HANDLE_STRIDE - 1)) || ucbnum >= GRU_NUM_CB)
                return -EINVAL;
        cbp = (unsigned long *)cb;

        gts = gru_find_lock_gts(cb);
        if (!gts)
                return -EINVAL;

        if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) {
                ret = -EINVAL;
                goto exit;
        }

        /*
         * If force_unload is set, the UPM TLB fault is phony. The task
         * has migrated to another node and the GSEG must be moved. Just
         * unload the context. The task will page fault and assign a new
         * context.
         */
        ret = -EAGAIN;
        cbrnum = thread_cbr_number(gts, ucbnum);
        if (gts->ts_force_unload) {
                gru_unload_context(gts, 1);
        } else if (gts->ts_gru) {
                tfh = get_tfh_by_index(gts->ts_gru, cbrnum);
                ret = gru_user_dropin(gts, tfh, cbp);
        }
exit:
        gru_unlock_gts(gts);
        return ret;
}

/*
 * Fetch the exception detail information for a CB that terminated with
 * an exception.
 */
int gru_get_exception_detail(unsigned long arg)
{
        struct control_block_extended_exc_detail excdet;
        struct gru_control_block_extended *cbe;
        struct gru_thread_state *gts;
        int ucbnum, cbrnum, ret;

        STAT(user_exception);
        if (copy_from_user(&excdet, (void __user *)arg, sizeof(excdet)))
                return -EFAULT;

        gru_dbg(grudev, "address 0x%lx\n", excdet.cb);
        gts = gru_find_lock_gts(excdet.cb);
        if (!gts)
                return -EINVAL;

        if (gts->ts_gru) {
                ucbnum = get_cb_number((void *)excdet.cb);
                cbrnum = thread_cbr_number(gts, ucbnum);
                cbe = get_cbe_by_index(gts->ts_gru, cbrnum);
                prefetchw(cbe);         /* Harmless on hardware, required for emulator */
                excdet.opc = cbe->opccpy;
                excdet.exopc = cbe->exopccpy;
                excdet.ecause = cbe->ecause;
                excdet.exceptdet0 = cbe->idef1upd;
                excdet.exceptdet1 = cbe->idef3upd;
                ret = 0;
        } else {
                ret = -EAGAIN;
        }
        gru_unlock_gts(gts);

        gru_dbg(grudev, "address 0x%lx, ecause 0x%x\n", excdet.cb,
                excdet.ecause);
        if (!ret && copy_to_user((void __user *)arg, &excdet, sizeof(excdet)))
                ret = -EFAULT;
        return ret;
}

/*
 * User request to unload a context. Content is saved for possible reload.
 */
int gru_user_unload_context(unsigned long arg)
{
        struct gru_thread_state *gts;
        struct gru_unload_context_req req;

        STAT(user_unload_context);
        if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
                return -EFAULT;

        gru_dbg(grudev, "gseg 0x%lx\n", req.gseg);

        gts = gru_find_lock_gts(req.gseg);
        if (!gts)
                return -EINVAL;

        if (gts->ts_gru)
                gru_unload_context(gts, 1);
        gru_unlock_gts(gts);

        return 0;
}

/*
 * User request to flush a range of virtual addresses from the GRU TLB
 * (Mainly for testing).
 */
int gru_user_flush_tlb(unsigned long arg)
{
        struct gru_thread_state *gts;
        struct gru_flush_tlb_req req;

        STAT(user_flush_tlb);
        if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
                return -EFAULT;

        gru_dbg(grudev, "gseg 0x%lx, vaddr 0x%lx, len 0x%lx\n", req.gseg,
                req.vaddr, req.len);

        gts = gru_find_lock_gts(req.gseg);
        if (!gts)
                return -EINVAL;

        gru_flush_tlb_range(gts->ts_gms, req.vaddr, req.vaddr + req.len);
        gru_unlock_gts(gts);

        return 0;
}

/*
 * Register the current task as the user of the GSEG slice.
 * Needed for TLB fault interrupt targeting.
 */
int gru_set_task_slice(long address)
{
        struct gru_thread_state *gts;

        STAT(set_task_slice);
        gru_dbg(grudev, "address 0x%lx\n", address);
        gts = gru_alloc_locked_gts(address);
        if (!gts)
                return -EINVAL;

        gts->ts_tgid_owner = current->tgid;
        gru_unlock_gts(gts);

        return 0;
}