From: mpi
Date: Wed, 15 Dec 2021 12:53:53 +0000 (+0000)
Subject: Use a per-UVM object lock to serialize the lower part of the fault handler.
X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=69c0451460338d5bad6de6353f62ed17acdab844;p=openbsd

Use a per-UVM object lock to serialize the lower part of the fault handler.

Like the per-amap lock, the `vmobjlock' is principally used to serialize
access to objects in the fault handler to allow faults occurring on
different CPUs and different objects to be processed in parallel.

The fault handler now acquires the `vmobjlock' of a given UVM object as
soon as it finds one. For now a write-lock is always acquired, even if
some operations could use a read-lock.

Every pager, corresponding to a different kind of UVM object, now expects
the UVM object to be locked, and some operations, like *_get(), return it
unlocked. This is enforced by assertions checking for rw_write_held().

The KERNEL_LOCK() is now pushed to the VFS boundary in the vnode pager.

To ensure the correct amap or object lock is held when modifying a page,
many uvm_page* operations now assert that the "owner" lock is held.
However, fields of "struct vm_page" are still being protected by the
global `pageqlock'. To prevent lock ordering issues with the new
`vmobjlock', and to reduce differences with NetBSD, this lock is now taken
and released for each page instead of around the whole loop.

This commit does not remove the KERNEL_LOCK/UNLOCK() dance. Unlocking
will follow if there is no fallout.

Ported from NetBSD, tested by many, thanks!

ok kettenis@, kn@
---
diff --git a/sys/dev/pci/drm/i915/gem/i915_gem_shmem.c b/sys/dev/pci/drm/i915/gem/i915_gem_shmem.c index ce8e2eca141..47b567087e7 100644 --- a/sys/dev/pci/drm/i915/gem/i915_gem_shmem.c +++ b/sys/dev/pci/drm/i915/gem/i915_gem_shmem.c @@ -268,8 +268,10 @@ shmem_truncate(struct drm_i915_gem_object *obj) #ifdef __linux__ shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1); #else + rw_enter(obj->base.uao->vmobjlock, RW_WRITE); obj->base.uao->pgops->pgo_flush(obj->base.uao, 0, obj->base.size, PGO_ALLPAGES | PGO_FREE); + rw_exit(obj->base.uao->vmobjlock); #endif obj->mm.madv = __I915_MADV_PURGED; obj->mm.pages = ERR_PTR(-EFAULT); diff --git a/sys/dev/pci/drm/radeon/radeon_ttm.c b/sys/dev/pci/drm/radeon/radeon_ttm.c index eb879b5c72c..837a9f94298 100644 --- a/sys/dev/pci/drm/radeon/radeon_ttm.c +++ b/sys/dev/pci/drm/radeon/radeon_ttm.c @@ -1006,6 +1006,8 @@ radeon_ttm_fault(struct uvm_faultinfo *ufi, vaddr_t vaddr, vm_page_t *pps, struct radeon_device *rdev; int r; + KASSERT(rw_write_held(ufi->entry->object.uvm_obj->vmobjlock)); + bo = (struct drm_gem_object *)ufi->entry->object.uvm_obj; rdev = bo->dev->dev_private; down_read(&rdev->pm.mclk_lock); diff --git a/sys/uvm/uvm_aobj.c b/sys/uvm/uvm_aobj.c index 20051d95dc1..1475760b6fc 100644 --- a/sys/uvm/uvm_aobj.c +++ b/sys/uvm/uvm_aobj.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_aobj.c,v 1.101 2021/10/24 13:46:14 mpi Exp $ */ +/* $OpenBSD: uvm_aobj.c,v 1.102 2021/12/15 12:53:53 mpi Exp $ */ /* $NetBSD: uvm_aobj.c,v 1.39 2001/02/18 21:19:08 chs Exp $ */ /* @@ -184,7 +184,7 @@ const struct uvm_pagerops aobj_pager = { * deadlock. */ static LIST_HEAD(aobjlist, uvm_aobj) uao_list = LIST_HEAD_INITIALIZER(uao_list); -static struct mutex uao_list_lock = MUTEX_INITIALIZER(IPL_NONE); +static struct mutex uao_list_lock = MUTEX_INITIALIZER(IPL_MPFLOOR); /* @@ -277,6 +277,7 @@ uao_find_swslot(struct uvm_object *uobj, int pageidx) * uao_set_swslot: set the swap slot for a page in an aobj.
* * => setting a slot to zero frees the slot + * => object must be locked by caller * => we return the old slot number, or -1 if we failed to allocate * memory to record the new slot number */ @@ -286,7 +287,7 @@ uao_set_swslot(struct uvm_object *uobj, int pageidx, int slot) struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; int oldslot; - KERNEL_ASSERT_LOCKED(); + KASSERT(rw_write_held(uobj->vmobjlock) || uobj->uo_refs == 0); KASSERT(UVM_OBJ_IS_AOBJ(uobj)); /* @@ -358,7 +359,9 @@ uao_free(struct uvm_aobj *aobj) struct uvm_object *uobj = &aobj->u_obj; KASSERT(UVM_OBJ_IS_AOBJ(uobj)); + KASSERT(rw_write_held(uobj->vmobjlock)); uao_dropswap_range(uobj, 0, 0); + rw_exit(uobj->vmobjlock); if (UAO_USES_SWHASH(aobj)) { /* @@ -671,6 +674,7 @@ struct uvm_object * uao_create(vsize_t size, int flags) { static struct uvm_aobj kernel_object_store; + static struct rwlock bootstrap_kernel_object_lock; static int kobj_alloced = 0; int pages = round_page(size) >> PAGE_SHIFT; struct uvm_aobj *aobj; @@ -742,6 +746,11 @@ uao_create(vsize_t size, int flags) * Initialise UVM object. */ uvm_obj_init(&aobj->u_obj, &aobj_pager, refs); + if (flags & UAO_FLAG_KERNOBJ) { + /* Use a temporary static lock for kernel_object. */ + rw_init(&bootstrap_kernel_object_lock, "kobjlk"); + uvm_obj_setlock(&aobj->u_obj, &bootstrap_kernel_object_lock); + } /* * now that aobj is ready, add it to the global list @@ -822,20 +831,20 @@ uao_detach(struct uvm_object *uobj) * involved in is complete), release any swap resources and free * the page itself. */ - uvm_lock_pageq(); - while((pg = RBT_ROOT(uvm_objtree, &uobj->memt)) != NULL) { + rw_enter(uobj->vmobjlock, RW_WRITE); + while ((pg = RBT_ROOT(uvm_objtree, &uobj->memt)) != NULL) { + pmap_page_protect(pg, PROT_NONE); if (pg->pg_flags & PG_BUSY) { atomic_setbits_int(&pg->pg_flags, PG_WANTED); - uvm_unlock_pageq(); - tsleep_nsec(pg, PVM, "uao_det", INFSLP); - uvm_lock_pageq(); + rwsleep_nsec(pg, uobj->vmobjlock, PVM, "uao_det", + INFSLP); continue; } - pmap_page_protect(pg, PROT_NONE); uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT); + uvm_lock_pageq(); uvm_pagefree(pg); + uvm_unlock_pageq(); } - uvm_unlock_pageq(); /* * Finally, free the anonymous UVM object itself. @@ -864,7 +873,7 @@ uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) voff_t curoff; KASSERT(UVM_OBJ_IS_AOBJ(uobj)); - KERNEL_ASSERT_LOCKED(); + KASSERT(rw_write_held(uobj->vmobjlock)); if (flags & PGO_ALLPAGES) { start = 0; @@ -901,7 +910,8 @@ uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) /* Make sure page is unbusy, else wait for it. */ if (pp->pg_flags & PG_BUSY) { atomic_setbits_int(&pp->pg_flags, PG_WANTED); - tsleep_nsec(pp, PVM, "uaoflsh", INFSLP); + rwsleep_nsec(pp, uobj->vmobjlock, PVM, "uaoflsh", + INFSLP); curoff -= PAGE_SIZE; continue; } @@ -972,7 +982,7 @@ uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) * 2: page is zero-fill -> allocate a new page and zero it. * 3: page is swapped out -> fetch the page from swap. * - * cases 1 and 2 can be handled with PGO_LOCKED, case 3 cannot. + * cases 1 can be handled with PGO_LOCKED, cases 2 and 3 cannot. * so, if the "center" page hits case 3 (or any page, with PGO_ALLPAGES), * then we will need to return VM_PAGER_UNLOCK. 
* @@ -992,7 +1002,7 @@ uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, boolean_t done; KASSERT(UVM_OBJ_IS_AOBJ(uobj)); - KERNEL_ASSERT_LOCKED(); + KASSERT(rw_write_held(uobj->vmobjlock)); /* * get number of pages @@ -1115,7 +1125,10 @@ uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, /* out of RAM? */ if (ptmp == NULL) { + rw_exit(uobj->vmobjlock); uvm_wait("uao_getpage"); + rw_enter(uobj->vmobjlock, RW_WRITE); + /* goto top of pps while loop */ continue; } @@ -1135,7 +1148,8 @@ uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, /* page is there, see if we need to wait on it */ if ((ptmp->pg_flags & PG_BUSY) != 0) { atomic_setbits_int(&ptmp->pg_flags, PG_WANTED); - tsleep_nsec(ptmp, PVM, "uao_get", INFSLP); + rwsleep_nsec(ptmp, uobj->vmobjlock, PVM, + "uao_get", INFSLP); continue; /* goto top of pps while loop */ } @@ -1169,8 +1183,12 @@ uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, } else { /* * page in the swapped-out page. + * unlock object for i/o, relock when done. */ + + rw_exit(uobj->vmobjlock); rv = uvm_swap_get(ptmp, swslot, PGO_SYNCIO); + rw_enter(uobj->vmobjlock, RW_WRITE); /* * I/O done. check for errors. @@ -1194,6 +1212,7 @@ uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, uvm_lock_pageq(); uvm_pagefree(ptmp); uvm_unlock_pageq(); + rw_exit(uobj->vmobjlock); return rv; } @@ -1215,11 +1234,14 @@ uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, } /* lcv loop */ + rw_exit(uobj->vmobjlock); return VM_PAGER_OK; } /* * uao_dropswap: release any swap resources from this aobj page. + * + * => aobj must be locked or have a reference count of 0. */ int uao_dropswap(struct uvm_object *uobj, int pageidx) @@ -1238,6 +1260,7 @@ uao_dropswap(struct uvm_object *uobj, int pageidx) /* * page in every page in every aobj that is paged-out to a range of swslots. * + * => aobj must be locked and is returned locked. * => returns TRUE if pagein was aborted due to lack of memory. */ boolean_t @@ -1272,7 +1295,9 @@ uao_swap_off(int startslot, int endslot) /* * Page in all pages in the swap slot range. */ + rw_enter(aobj->u_obj.vmobjlock, RW_WRITE); rv = uao_pagein(aobj, startslot, endslot); + rw_exit(aobj->u_obj.vmobjlock); /* Drop the reference of the current object. */ uao_detach(&aobj->u_obj); @@ -1375,14 +1400,21 @@ restart: static boolean_t uao_pagein_page(struct uvm_aobj *aobj, int pageidx) { + struct uvm_object *uobj = &aobj->u_obj; struct vm_page *pg; int rv, slot, npages; pg = NULL; npages = 1; + + KASSERT(rw_write_held(uobj->vmobjlock)); rv = uao_get(&aobj->u_obj, (voff_t)pageidx << PAGE_SHIFT, &pg, &npages, 0, PROT_READ | PROT_WRITE, 0, 0); + /* + * relock and finish up. 
+ */ + rw_enter(uobj->vmobjlock, RW_WRITE); switch (rv) { case VM_PAGER_OK: break; @@ -1430,7 +1462,7 @@ uao_dropswap_range(struct uvm_object *uobj, voff_t start, voff_t end) int swpgonlydelta = 0; KASSERT(UVM_OBJ_IS_AOBJ(uobj)); - /* KASSERT(mutex_owned(uobj->vmobjlock)); */ + KASSERT(rw_write_held(uobj->vmobjlock)); if (end == 0) { end = INT64_MAX; diff --git a/sys/uvm/uvm_device.c b/sys/uvm/uvm_device.c index e5d035f2947..74d9490b326 100644 --- a/sys/uvm/uvm_device.c +++ b/sys/uvm/uvm_device.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_device.c,v 1.65 2021/10/23 14:42:08 mpi Exp $ */ +/* $OpenBSD: uvm_device.c,v 1.66 2021/12/15 12:53:53 mpi Exp $ */ /* $NetBSD: uvm_device.c,v 1.30 2000/11/25 06:27:59 chs Exp $ */ /* @@ -166,7 +166,9 @@ udv_attach(dev_t device, vm_prot_t accessprot, voff_t off, vsize_t size) /* * bump reference count, unhold, return. */ + rw_enter(lcv->u_obj.vmobjlock, RW_WRITE); lcv->u_obj.uo_refs++; + rw_exit(lcv->u_obj.vmobjlock); mtx_enter(&udv_lock); if (lcv->u_flags & UVM_DEVICE_WANTED) @@ -228,8 +230,9 @@ udv_attach(dev_t device, vm_prot_t accessprot, voff_t off, vsize_t size) static void udv_reference(struct uvm_object *uobj) { - KERNEL_ASSERT_LOCKED(); + rw_enter(uobj->vmobjlock, RW_WRITE); uobj->uo_refs++; + rw_exit(uobj->vmobjlock); } /* @@ -248,8 +251,10 @@ udv_detach(struct uvm_object *uobj) * loop until done */ again: + rw_enter(uobj->vmobjlock, RW_WRITE); if (uobj->uo_refs > 1) { uobj->uo_refs--; + rw_exit(uobj->vmobjlock); return; } KASSERT(uobj->uo_npages == 0 && RBT_EMPTY(uvm_objtree, &uobj->memt)); @@ -260,10 +265,7 @@ again: mtx_enter(&udv_lock); if (udv->u_flags & UVM_DEVICE_HOLD) { udv->u_flags |= UVM_DEVICE_WANTED; - /* - * lock interleaving. -- this is ok in this case since the - * locks are both IPL_NONE - */ + rw_exit(uobj->vmobjlock); msleep_nsec(udv, &udv_lock, PVM | PNORELOCK, "udv_detach", INFSLP); goto again; @@ -276,6 +278,7 @@ again: if (udv->u_flags & UVM_DEVICE_WANTED) wakeup(udv); mtx_leave(&udv_lock); + rw_exit(uobj->vmobjlock); uvm_obj_destroy(uobj); free(udv, M_TEMP, sizeof(*udv)); diff --git a/sys/uvm/uvm_fault.c b/sys/uvm/uvm_fault.c index c90d9b3fa81..4cd2efea1ec 100644 --- a/sys/uvm/uvm_fault.c +++ b/sys/uvm/uvm_fault.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_fault.c,v 1.121 2021/10/12 07:38:22 mpi Exp $ */ +/* $OpenBSD: uvm_fault.c,v 1.122 2021/12/15 12:53:53 mpi Exp $ */ /* $NetBSD: uvm_fault.c,v 1.51 2000/08/06 00:22:53 thorpej Exp $ */ /* @@ -326,7 +326,8 @@ uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap, if (pg->uobject) { /* Owner of page is UVM object. */ uvmfault_unlockall(ufi, amap, NULL); - tsleep_nsec(pg, PVM, "anonget1", INFSLP); + rwsleep_nsec(pg, pg->uobject->vmobjlock, + PVM | PNORELOCK, "anonget1", INFSLP); } else { /* Owner of page is anon. 
*/ uvmfault_unlockall(ufi, NULL, NULL); @@ -620,6 +621,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vaddr, vm_fault_t fault_type, */ if (uobj != NULL && uobj->pgops->pgo_fault != NULL) { KERNEL_LOCK(); + rw_enter(uobj->vmobjlock, RW_WRITE); error = uobj->pgops->pgo_fault(&ufi, flt.startva, pages, flt.npages, flt.centeridx, fault_type, flt.access_type, @@ -793,10 +795,10 @@ uvm_fault_check(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, voff_t uoff; uoff = (flt->startva - ufi->entry->start) + ufi->entry->offset; - KERNEL_LOCK(); + rw_enter(uobj->vmobjlock, RW_WRITE); (void) uobj->pgops->pgo_flush(uobj, uoff, uoff + ((vsize_t)nback << PAGE_SHIFT), PGO_DEACTIVATE); - KERNEL_UNLOCK(); + rw_exit(uobj->vmobjlock); } /* now forget about the backpages */ @@ -1098,6 +1100,8 @@ uvm_fault_lower_lookup( int lcv, gotpages; vaddr_t currva; + rw_enter(uobj->vmobjlock, RW_WRITE); + counters_inc(uvmexp_counters, flt_lget); gotpages = flt->npages; (void) uobj->pgops->pgo_get(uobj, @@ -1211,6 +1215,14 @@ uvm_fault_lower(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, * made it BUSY. */ + /* + * locked: + */ + KASSERT(amap == NULL || + rw_write_held(amap->am_lock)); + KASSERT(uobj == NULL || + rw_write_held(uobj->vmobjlock)); + /* * note that uobjpage can not be PGO_DONTCARE at this point. we now * set uobjpage to PGO_DONTCARE if we are doing a zero fill. if we @@ -1268,6 +1280,7 @@ uvm_fault_lower(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, return (EIO); uobjpage = PGO_DONTCARE; + uobj = NULL; promote = TRUE; } @@ -1276,6 +1289,12 @@ uvm_fault_lower(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, if (locked && amap != NULL) amap_lock(amap); + /* might be changed */ + if (uobjpage != PGO_DONTCARE) { + uobj = uobjpage->uobject; + rw_enter(uobj->vmobjlock, RW_WRITE); + } + /* * Re-verify that amap slot is still free. if there is * a problem, we clean up. @@ -1300,10 +1319,12 @@ uvm_fault_lower(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, atomic_clearbits_int(&uobjpage->pg_flags, PG_BUSY|PG_WANTED); UVM_PAGE_OWN(uobjpage, NULL); - return ERESTART; } - if (locked == FALSE) + + if (locked == FALSE) { + rw_exit(uobj->vmobjlock); return ERESTART; + } /* * we have the data in uobjpage which is PG_BUSY @@ -1423,6 +1444,7 @@ uvm_fault_lower(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, uvm_lock_pageq(); uvm_pageactivate(uobjpage); uvm_unlock_pageq(); + rw_exit(uobj->vmobjlock); uobj = NULL; } else { counters_inc(uvmexp_counters, flt_przero); @@ -1434,7 +1456,7 @@ uvm_fault_lower(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, if (amap_add(&ufi->entry->aref, ufi->orig_rvaddr - ufi->entry->start, anon, 0)) { - uvmfault_unlockall(ufi, amap, NULL); + uvmfault_unlockall(ufi, amap, uobj); uvm_anfree(anon); counters_inc(uvmexp_counters, flt_noamap); @@ -1483,25 +1505,32 @@ uvm_fault_lower(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, return ERESTART; } - uvm_lock_pageq(); - if (fault_type == VM_FAULT_WIRE) { + uvm_lock_pageq(); uvm_pagewire(pg); + uvm_unlock_pageq(); if (pg->pg_flags & PQ_AOBJ) { /* * since the now-wired page cannot be paged out, * release its swap resources for others to use. - * since an aobj page with no swap cannot be PG_CLEAN, - * clear its clean flag now. + * since an aobj page with no swap cannot be clean, + * mark it dirty now. + * + * use pg->uobject here. if the page is from a + * tmpfs vnode, the pages are backed by its UAO and + * not the vnode. 
*/ + KASSERT(uobj != NULL); + KASSERT(uobj->vmobjlock == pg->uobject->vmobjlock); atomic_clearbits_int(&pg->pg_flags, PG_CLEAN); uao_dropswap(uobj, pg->offset >> PAGE_SHIFT); } } else { /* activate it */ + uvm_lock_pageq(); uvm_pageactivate(pg); + uvm_unlock_pageq(); } - uvm_unlock_pageq(); if (pg->pg_flags & PG_WANTED) wakeup(pg); @@ -1567,7 +1596,7 @@ uvm_fault_unwire(vm_map_t map, vaddr_t start, vaddr_t end) void uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end) { - vm_map_entry_t entry, next; + vm_map_entry_t entry, oentry = NULL, next; pmap_t pmap = vm_map_pmap(map); vaddr_t va; paddr_t pa; @@ -1578,12 +1607,9 @@ uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end) /* * we assume that the area we are unwiring has actually been wired * in the first place. this means that we should be able to extract - * the PAs from the pmap. we also lock out the page daemon so that - * we can call uvm_pageunwire. + * the PAs from the pmap. */ - uvm_lock_pageq(); - /* * find the beginning map entry for the region. */ @@ -1605,6 +1631,17 @@ uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end) entry = next; } + /* + * lock it. + */ + if (entry != oentry) { + if (oentry != NULL) { + uvm_map_unlock_entry(oentry); + } + uvm_map_lock_entry(entry); + oentry = entry; + } + /* * if the entry is no longer wired, tell the pmap. */ @@ -1612,11 +1649,16 @@ uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end) pmap_unwire(pmap, va); pg = PHYS_TO_VM_PAGE(pa); - if (pg) + if (pg) { + uvm_lock_pageq(); uvm_pageunwire(pg); + uvm_unlock_pageq(); + } } - uvm_unlock_pageq(); + if (oentry != NULL) { + uvm_map_unlock_entry(entry); + } } /* @@ -1650,6 +1692,8 @@ void uvmfault_unlockall(struct uvm_faultinfo *ufi, struct vm_amap *amap, struct uvm_object *uobj) { + if (uobj) + rw_exit(uobj->vmobjlock); if (amap != NULL) amap_unlock(amap); uvmfault_unlockmaps(ufi, FALSE); diff --git a/sys/uvm/uvm_km.c b/sys/uvm/uvm_km.c index fc31ae99dff..94c9951dd79 100644 --- a/sys/uvm/uvm_km.c +++ b/sys/uvm/uvm_km.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_km.c,v 1.146 2021/10/24 15:23:52 mpi Exp $ */ +/* $OpenBSD: uvm_km.c,v 1.147 2021/12/15 12:53:53 mpi Exp $ */ /* $NetBSD: uvm_km.c,v 1.42 2001/01/14 02:10:01 thorpej Exp $ */ /* @@ -249,13 +249,15 @@ uvm_km_pgremove(struct uvm_object *uobj, vaddr_t startva, vaddr_t endva) int swpgonlydelta = 0; KASSERT(UVM_OBJ_IS_AOBJ(uobj)); + KASSERT(rw_write_held(uobj->vmobjlock)); pmap_remove(pmap_kernel(), startva, endva); for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) { pp = uvm_pagelookup(uobj, curoff); if (pp && pp->pg_flags & PG_BUSY) { atomic_setbits_int(&pp->pg_flags, PG_WANTED); - tsleep_nsec(pp, PVM, "km_pgrm", INFSLP); + rwsleep_nsec(pp, uobj->vmobjlock, PVM, "km_pgrm", + INFSLP); curoff -= PAGE_SIZE; /* loop back to us */ continue; } @@ -383,6 +385,9 @@ uvm_km_kmemalloc_pla(struct vm_map *map, struct uvm_object *obj, vsize_t size, return (0); } + if (obj != NULL) + rw_enter(obj->vmobjlock, RW_WRITE); + loopva = kva; while (loopva != kva + size) { pg = TAILQ_FIRST(&pgl); @@ -409,6 +414,9 @@ uvm_km_kmemalloc_pla(struct vm_map *map, struct uvm_object *obj, vsize_t size, KASSERT(TAILQ_EMPTY(&pgl)); pmap_update(pmap_kernel()); + if (obj != NULL) + rw_exit(obj->vmobjlock); + return kva; } @@ -474,12 +482,14 @@ uvm_km_alloc1(struct vm_map *map, vsize_t size, vsize_t align, boolean_t zeroit) /* now allocate the memory. we must be careful about released pages. 
*/ loopva = kva; while (size) { + rw_enter(uvm.kernel_object->vmobjlock, RW_WRITE); /* allocate ram */ pg = uvm_pagealloc(uvm.kernel_object, offset, NULL, 0); if (pg) { atomic_clearbits_int(&pg->pg_flags, PG_BUSY); UVM_PAGE_OWN(pg, NULL); } + rw_exit(uvm.kernel_object->vmobjlock); if (__predict_false(pg == NULL)) { if (curproc == uvm.pagedaemon_proc) { /* diff --git a/sys/uvm/uvm_map.c b/sys/uvm/uvm_map.c index 4f192f5b5ea..d4e420d4c1e 100644 --- a/sys/uvm/uvm_map.c +++ b/sys/uvm/uvm_map.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_map.c,v 1.280 2021/12/07 18:30:26 deraadt Exp $ */ +/* $OpenBSD: uvm_map.c,v 1.281 2021/12/15 12:53:53 mpi Exp $ */ /* $NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $ */ /* @@ -124,6 +124,8 @@ struct vm_map_entry *uvm_mapent_alloc(struct vm_map*, int); void uvm_mapent_free(struct vm_map_entry*); void uvm_unmap_kill_entry(struct vm_map*, struct vm_map_entry*); +void uvm_unmap_kill_entry_withlock(struct vm_map *, + struct vm_map_entry *, int); void uvm_unmap_detach_intrsafe(struct uvm_map_deadq *); void uvm_mapent_mkfree(struct vm_map*, struct vm_map_entry*, struct vm_map_entry**, @@ -499,6 +501,28 @@ uvm_map_reference(struct vm_map *map) atomic_inc_int(&map->ref_count); } +void +uvm_map_lock_entry(struct vm_map_entry *entry) +{ + if (entry->aref.ar_amap != NULL) { + amap_lock(entry->aref.ar_amap); + } + if (UVM_ET_ISOBJ(entry)) { + rw_enter(entry->object.uvm_obj->vmobjlock, RW_WRITE); + } +} + +void +uvm_map_unlock_entry(struct vm_map_entry *entry) +{ + if (UVM_ET_ISOBJ(entry)) { + rw_exit(entry->object.uvm_obj->vmobjlock); + } + if (entry->aref.ar_amap != NULL) { + amap_unlock(entry->aref.ar_amap); + } +} + /* * Calculate the dused delta. */ @@ -2101,7 +2125,8 @@ uvm_mapent_mkfree(struct vm_map *map, struct vm_map_entry *entry, * Unwire and release referenced amap and object from map entry. */ void -uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry) +uvm_unmap_kill_entry_withlock(struct vm_map *map, struct vm_map_entry *entry, + int needlock) { /* Unwire removed map entry. */ if (VM_MAPENT_ISWIRED(entry)) { @@ -2111,6 +2136,9 @@ uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry) KERNEL_UNLOCK(); } + if (needlock) + uvm_map_lock_entry(entry); + /* Entry-type specific code. */ if (UVM_ET_ISHOLE(entry)) { /* Nothing to be done for holes. */ @@ -2157,17 +2185,19 @@ uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry) */ uvm_km_pgremove(entry->object.uvm_obj, entry->start, entry->end); - - /* - * null out kernel_object reference, we've just - * dropped it - */ - entry->etype &= ~UVM_ET_OBJ; - entry->object.uvm_obj = NULL; /* to be safe */ } else { /* remove mappings the standard way. */ pmap_remove(map->pmap, entry->start, entry->end); } + + if (needlock) + uvm_map_unlock_entry(entry); +} + +void +uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry) +{ + uvm_unmap_kill_entry_withlock(map, entry, 0); } /* @@ -2227,7 +2257,7 @@ uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end, map->sserial++; /* Kill entry. */ - uvm_unmap_kill_entry(map, entry); + uvm_unmap_kill_entry_withlock(map, entry, 1); /* Update space usage. 
*/ if ((map->flags & VM_MAP_ISVMSPACE) && @@ -3420,8 +3450,10 @@ uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end, */ iter->wired_count = 0; } + uvm_map_lock_entry(iter); pmap_protect(map->pmap, iter->start, iter->end, iter->protection & mask); + uvm_map_unlock_entry(iter); } /* @@ -3967,11 +3999,13 @@ uvm_mapent_forkcopy(struct vmspace *new_vm, struct vm_map *new_map, */ if (!UVM_ET_ISNEEDSCOPY(old_entry)) { if (old_entry->max_protection & PROT_WRITE) { + uvm_map_lock_entry(old_entry); pmap_protect(old_map->pmap, old_entry->start, old_entry->end, old_entry->protection & ~PROT_WRITE); + uvm_map_unlock_entry(old_entry); pmap_update(old_map->pmap); } old_entry->etype |= UVM_ET_NEEDSCOPY; @@ -4751,9 +4785,11 @@ flush_object: ((flags & PGO_FREE) == 0 || ((entry->max_protection & PROT_WRITE) != 0 && (entry->etype & UVM_ET_COPYONWRITE) == 0))) { + rw_enter(uobj->vmobjlock, RW_WRITE); rv = uobj->pgops->pgo_flush(uobj, cp_start - entry->start + entry->offset, cp_end - entry->start + entry->offset, flags); + rw_exit(uobj->vmobjlock); if (rv == FALSE) error = EFAULT; diff --git a/sys/uvm/uvm_map.h b/sys/uvm/uvm_map.h index 12092ebfcd2..14aeba504bc 100644 --- a/sys/uvm/uvm_map.h +++ b/sys/uvm/uvm_map.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_map.h,v 1.70 2021/05/22 08:38:29 mpi Exp $ */ +/* $OpenBSD: uvm_map.h,v 1.71 2021/12/15 12:53:53 mpi Exp $ */ /* $NetBSD: uvm_map.h,v 1.24 2001/02/18 21:19:08 chs Exp $ */ /* @@ -442,6 +442,9 @@ void vm_map_unbusy_ln(struct vm_map*, char*, int); #define vm_map_unbusy(map) vm_map_unbusy_ln(map, NULL, 0) #endif +void uvm_map_lock_entry(struct vm_map_entry *); +void uvm_map_unlock_entry(struct vm_map_entry *); + #endif /* _KERNEL */ /* diff --git a/sys/uvm/uvm_object.c b/sys/uvm/uvm_object.c index 675cd9de2da..838c3adafb2 100644 --- a/sys/uvm/uvm_object.c +++ b/sys/uvm/uvm_object.c @@ -1,7 +1,7 @@ -/* $OpenBSD: uvm_object.c,v 1.22 2021/10/23 14:42:08 mpi Exp $ */ +/* $OpenBSD: uvm_object.c,v 1.23 2021/12/15 12:53:53 mpi Exp $ */ /* - * Copyright (c) 2006 The NetBSD Foundation, Inc. + * Copyright (c) 2006, 2010, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -51,15 +52,27 @@ const struct uvm_pagerops bufcache_pager = { /* nothing */ }; -/* We will fetch this page count per step */ +/* Page count to fetch per single step. */ #define FETCH_PAGECOUNT 16 /* - * uvm_obj_init: initialise a uvm object. + * uvm_obj_init: initialize UVM memory object. */ void uvm_obj_init(struct uvm_object *uobj, const struct uvm_pagerops *pgops, int refs) { + int alock; + + alock = ((pgops != NULL) && (pgops != &pmap_pager) && + (pgops != &bufcache_pager) && (refs != UVM_OBJ_KERN)); + + if (alock) { + /* Allocate and assign a lock. */ + rw_obj_alloc(&uobj->vmobjlock, "uobjlk"); + } else { + /* The lock will need to be set via uvm_obj_setlock(). */ + uobj->vmobjlock = NULL; + } uobj->pgops = pgops; RBT_INIT(uvm_objtree, &uobj->memt); uobj->uo_npages = 0; @@ -73,12 +86,38 @@ void uvm_obj_destroy(struct uvm_object *uo) { KASSERT(RBT_EMPTY(uvm_objtree, &uo->memt)); + + rw_obj_free(uo->vmobjlock); +} + +/* + * uvm_obj_setlock: assign a vmobjlock to the UVM object. + * + * => Caller is responsible to ensure that UVM objects is not use. + * => Only dynamic lock may be previously set. We drop the reference then. 
+ */ +void +uvm_obj_setlock(struct uvm_object *uo, struct rwlock *lockptr) +{ + struct rwlock *olockptr = uo->vmobjlock; + + if (olockptr) { + /* Drop the reference on the old lock. */ + rw_obj_free(olockptr); + } + if (lockptr == NULL) { + /* If new lock is not passed - allocate default one. */ + rw_obj_alloc(&lockptr, "uobjlk"); + } + uo->vmobjlock = lockptr; } #ifndef SMALL_KERNEL /* - * uvm_obj_wire: wire the pages of entire uobj + * uvm_obj_wire: wire the pages of entire UVM object. * + * => NOTE: this function should only be used for types of objects + * where PG_RELEASED flag is never set (aobj objects) * => caller must pass page-aligned start and end values * => if the caller passes in a pageq pointer, we'll return a list of * wired pages. @@ -94,6 +133,7 @@ uvm_obj_wire(struct uvm_object *uobj, voff_t start, voff_t end, left = (end - start) >> PAGE_SHIFT; + rw_enter(uobj->vmobjlock, RW_WRITE); while (left) { npages = MIN(FETCH_PAGECOUNT, left); @@ -107,6 +147,7 @@ uvm_obj_wire(struct uvm_object *uobj, voff_t start, voff_t end, if (error) goto error; + rw_enter(uobj->vmobjlock, RW_WRITE); for (i = 0; i < npages; i++) { KASSERT(pgs[i] != NULL); @@ -134,6 +175,7 @@ uvm_obj_wire(struct uvm_object *uobj, voff_t start, voff_t end, left -= npages; offset += (voff_t)npages << PAGE_SHIFT; } + rw_exit(uobj->vmobjlock); return 0; @@ -145,17 +187,17 @@ error: } /* - * uobj_unwirepages: unwire the pages of entire uobj + * uvm_obj_unwire: unwire the pages of entire UVM object. * * => caller must pass page-aligned start and end values */ - void uvm_obj_unwire(struct uvm_object *uobj, voff_t start, voff_t end) { struct vm_page *pg; off_t offset; + rw_enter(uobj->vmobjlock, RW_WRITE); uvm_lock_pageq(); for (offset = start; offset < end; offset += PAGE_SIZE) { pg = uvm_pagelookup(uobj, offset); @@ -166,6 +208,7 @@ uvm_obj_unwire(struct uvm_object *uobj, voff_t start, voff_t end) uvm_pageunwire(pg); } uvm_unlock_pageq(); + rw_exit(uobj->vmobjlock); } #endif /* !SMALL_KERNEL */ diff --git a/sys/uvm/uvm_object.h b/sys/uvm/uvm_object.h index 9a74600c9df..13ecdda6da2 100644 --- a/sys/uvm/uvm_object.h +++ b/sys/uvm/uvm_object.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_object.h,v 1.28 2021/10/12 18:16:51 kettenis Exp $ */ +/* $OpenBSD: uvm_object.h,v 1.29 2021/12/15 12:53:53 mpi Exp $ */ /* $NetBSD: uvm_object.h,v 1.11 2001/03/09 01:02:12 chs Exp $ */ /* @@ -32,14 +32,25 @@ #define _UVM_UVM_OBJECT_H_ /* - * uvm_object.h - */ - -/* - * uvm_object: all that is left of mach objects. + * The UVM memory object interface. Notes: + * + * A UVM memory object represents a list of pages, which are managed by + * the object's pager operations (uvm_object::pgops). All pages belonging + * to an object are owned by it and thus protected by the object lock. + * + * The lock (uvm_object::vmobjlock) may be shared amongst the UVM objects. + * By default, the lock is allocated dynamically using rw_obj_init() cache. + * Lock sharing is normally used when there is an underlying object. For + * example, vnode representing a file may have an underlying node, which + * is the case for tmpfs and layered file systems. In such case, vnode's + * UVM object and the underlying UVM object shares the lock. + * + * The reference count is managed atomically for the anonymous UVM objects. + * For other objects, it is arbitrary (may use the lock or atomics). 
*/ struct uvm_object { + struct rwlock *vmobjlock; /* lock on object */ const struct uvm_pagerops *pgops; /* pager ops */ RBT_HEAD(uvm_objtree, vm_page) memt; /* pages in object */ int uo_npages; /* # of pages in memt */ @@ -52,10 +63,10 @@ struct uvm_object { * memory objects don't have reference counts -- they never die). * * this value is used to detected kernel object mappings at uvm_unmap() - * time. normally when an object is unmapped its pages eventually become - * deactivated and then paged out and/or freed. this is not useful + * time. normally when an object is unmapped its pages eventaully become + * deactivated and then paged out and/or freed. this is not useful * for kernel objects... when a kernel object is unmapped we always want - * to free the resources associated with the mapping. UVM_OBJ_KERN + * to free the resources associated with the mapping. UVM_OBJ_KERN * allows us to decide which type of unmapping we want to do. * * in addition, we have kernel objects which may be used in an @@ -100,8 +111,12 @@ RBT_PROTOTYPE(uvm_objtree, vm_page, objt, uvm_pagecmp) #define UVM_OBJ_IS_BUFCACHE(uobj) \ ((uobj)->pgops == &bufcache_pager) +#define UVM_OBJ_IS_DUMMY(uobj) \ + (UVM_OBJ_IS_PMAP(uobj) || UVM_OBJ_IS_BUFCACHE(uobj)) + void uvm_obj_init(struct uvm_object *, const struct uvm_pagerops *, int); void uvm_obj_destroy(struct uvm_object *); +void uvm_obj_setlock(struct uvm_object *, struct rwlock *); int uvm_obj_wire(struct uvm_object *, voff_t, voff_t, struct pglist *); void uvm_obj_unwire(struct uvm_object *, voff_t, voff_t); void uvm_obj_free(struct uvm_object *); diff --git a/sys/uvm/uvm_page.c b/sys/uvm/uvm_page.c index a90b23af6df..e2b91c87e0c 100644 --- a/sys/uvm/uvm_page.c +++ b/sys/uvm/uvm_page.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_page.c,v 1.159 2021/10/17 11:39:40 patrick Exp $ */ +/* $OpenBSD: uvm_page.c,v 1.160 2021/12/15 12:53:53 mpi Exp $ */ /* $NetBSD: uvm_page.c,v 1.44 2000/11/27 08:40:04 chs Exp $ */ /* @@ -118,6 +118,7 @@ static vaddr_t virtual_space_end; */ static void uvm_pageinsert(struct vm_page *); static void uvm_pageremove(struct vm_page *); +int uvm_page_owner_locked_p(struct vm_page *); /* * inline functions @@ -125,7 +126,7 @@ static void uvm_pageremove(struct vm_page *); /* * uvm_pageinsert: insert a page in the object * - * => caller must lock page queues XXX questionable + * => caller must lock object * => call should have already set pg's object and offset pointers * and bumped the version counter */ @@ -134,7 +135,10 @@ uvm_pageinsert(struct vm_page *pg) { struct vm_page *dupe; + KASSERT(UVM_OBJ_IS_DUMMY(pg->uobject) || + rw_write_held(pg->uobject->vmobjlock)); KASSERT((pg->pg_flags & PG_TABLED) == 0); + dupe = RBT_INSERT(uvm_objtree, &pg->uobject->memt, pg); /* not allowed to insert over another page */ KASSERT(dupe == NULL); @@ -145,12 +149,15 @@ uvm_pageinsert(struct vm_page *pg) /* * uvm_page_remove: remove page from object * - * => caller must lock page queues + * => caller must lock object */ static inline void uvm_pageremove(struct vm_page *pg) { + KASSERT(UVM_OBJ_IS_DUMMY(pg->uobject) || + rw_write_held(pg->uobject->vmobjlock)); KASSERT(pg->pg_flags & PG_TABLED); + RBT_REMOVE(uvm_objtree, &pg->uobject->memt, pg); atomic_clearbits_int(&pg->pg_flags, PG_TABLED); @@ -683,11 +690,19 @@ uvm_pagealloc_pg(struct vm_page *pg, struct uvm_object *obj, voff_t off, { int flags; + KASSERT(obj == NULL || anon == NULL); + KASSERT(anon == NULL || off == 0); + KASSERT(off == trunc_page(off)); + KASSERT(obj == NULL || UVM_OBJ_IS_DUMMY(obj) || + 
rw_write_held(obj->vmobjlock)); + KASSERT(anon == NULL || anon->an_lock == NULL || + rw_write_held(anon->an_lock)); + flags = PG_BUSY | PG_FAKE; pg->offset = off; pg->uobject = obj; pg->uanon = anon; - + KASSERT(uvm_page_owner_locked_p(pg)); if (anon) { anon->an_page = pg; flags |= PQ_ANON; @@ -846,7 +861,9 @@ uvm_pagerealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size, uvm_pagecopy(tpg, pg); KASSERT(tpg->wire_count == 1); tpg->wire_count = 0; + uvm_lock_pageq(); uvm_pagefree(tpg); + uvm_unlock_pageq(); uvm_pagealloc_pg(pg, obj, offset, NULL); } } @@ -873,6 +890,10 @@ uvm_pagealloc(struct uvm_object *obj, voff_t off, struct vm_anon *anon, KASSERT(obj == NULL || anon == NULL); KASSERT(anon == NULL || off == 0); KASSERT(off == trunc_page(off)); + KASSERT(obj == NULL || UVM_OBJ_IS_DUMMY(obj) || + rw_write_held(obj->vmobjlock)); + KASSERT(anon == NULL || anon->an_lock == NULL || + rw_write_held(anon->an_lock)); pmr_flags = UVM_PLA_NOWAIT; @@ -940,10 +961,9 @@ uvm_pageclean(struct vm_page *pg) { u_int flags_to_clear = 0; -#if all_pmap_are_fixed - if (pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) + if ((pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) && + (pg->uobject == NULL || !UVM_OBJ_IS_PMAP(pg->uobject))) MUTEX_ASSERT_LOCKED(&uvm.pageqlock); -#endif #ifdef DEBUG if (pg->uobject == (void *)0xdeadbeef && @@ -953,6 +973,10 @@ uvm_pageclean(struct vm_page *pg) #endif KASSERT((pg->pg_flags & PG_DEV) == 0); + KASSERT(pg->uobject == NULL || UVM_OBJ_IS_DUMMY(pg->uobject) || + rw_write_held(pg->uobject->vmobjlock)); + KASSERT(pg->uobject != NULL || pg->uanon == NULL || + rw_write_held(pg->uanon->an_lock)); /* * if the page was an object page (and thus "TABLED"), remove it @@ -1009,10 +1033,9 @@ uvm_pageclean(struct vm_page *pg) void uvm_pagefree(struct vm_page *pg) { -#if all_pmap_are_fixed - if (pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) + if ((pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) && + (pg->uobject == NULL || !UVM_OBJ_IS_PMAP(pg->uobject))) MUTEX_ASSERT_LOCKED(&uvm.pageqlock); -#endif uvm_pageclean(pg); uvm_pmr_freepages(pg, 1); @@ -1037,6 +1060,10 @@ uvm_page_unbusy(struct vm_page **pgs, int npgs) if (pg == NULL || pg == PGO_DONTCARE) { continue; } + + KASSERT(uvm_page_owner_locked_p(pg)); + KASSERT(pg->pg_flags & PG_BUSY); + if (pg->pg_flags & PG_WANTED) { wakeup(pg); } @@ -1207,6 +1234,7 @@ uvm_pagelookup(struct uvm_object *obj, voff_t off) void uvm_pagewire(struct vm_page *pg) { + KASSERT(uvm_page_owner_locked_p(pg)); MUTEX_ASSERT_LOCKED(&uvm.pageqlock); if (pg->wire_count == 0) { @@ -1237,6 +1265,7 @@ uvm_pagewire(struct vm_page *pg) void uvm_pageunwire(struct vm_page *pg) { + KASSERT(uvm_page_owner_locked_p(pg)); MUTEX_ASSERT_LOCKED(&uvm.pageqlock); pg->wire_count--; @@ -1258,6 +1287,7 @@ uvm_pageunwire(struct vm_page *pg) void uvm_pagedeactivate(struct vm_page *pg) { + KASSERT(uvm_page_owner_locked_p(pg)); MUTEX_ASSERT_LOCKED(&uvm.pageqlock); if (pg->pg_flags & PQ_ACTIVE) { @@ -1294,6 +1324,7 @@ uvm_pagedeactivate(struct vm_page *pg) void uvm_pageactivate(struct vm_page *pg) { + KASSERT(uvm_page_owner_locked_p(pg)); MUTEX_ASSERT_LOCKED(&uvm.pageqlock); if (pg->pg_flags & PQ_INACTIVE) { @@ -1341,6 +1372,24 @@ uvm_pagecopy(struct vm_page *src, struct vm_page *dst) pmap_copy_page(src, dst); } +/* + * uvm_page_owner_locked_p: return true if object associated with page is + * locked. this is a weak check for runtime assertions only. 
+ */ +int +uvm_page_owner_locked_p(struct vm_page *pg) +{ + if (pg->uobject != NULL) { + if (UVM_OBJ_IS_DUMMY(pg->uobject)) + return 1; + return rw_write_held(pg->uobject->vmobjlock); + } + if (pg->uanon != NULL) { + return rw_write_held(pg->uanon->an_lock); + } + return 1; +} + /* * uvm_pagecount: count the number of physical pages in the address range. */ diff --git a/sys/uvm/uvm_pager.c b/sys/uvm/uvm_pager.c index 286e7c2a025..ee900527f80 100644 --- a/sys/uvm/uvm_pager.c +++ b/sys/uvm/uvm_pager.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_pager.c,v 1.76 2021/03/26 13:40:05 mpi Exp $ */ +/* $OpenBSD: uvm_pager.c,v 1.77 2021/12/15 12:53:53 mpi Exp $ */ /* $NetBSD: uvm_pager.c,v 1.36 2000/11/27 18:26:41 chs Exp $ */ /* @@ -134,6 +134,24 @@ uvm_pseg_get(int flags) int i; struct uvm_pseg *pseg; + /* + * XXX Prevent lock ordering issue in uvm_unmap_detach(). A real + * fix would be to move the KERNEL_LOCK() out of uvm_unmap_detach(). + * + * witness_checkorder() at witness_checkorder+0xba0 + * __mp_lock() at __mp_lock+0x5f + * uvm_unmap_detach() at uvm_unmap_detach+0xc5 + * uvm_map() at uvm_map+0x857 + * uvm_km_valloc_try() at uvm_km_valloc_try+0x65 + * uvm_pseg_get() at uvm_pseg_get+0x6f + * uvm_pagermapin() at uvm_pagermapin+0x45 + * uvn_io() at uvn_io+0xcf + * uvn_get() at uvn_get+0x156 + * uvm_fault_lower() at uvm_fault_lower+0x28a + * uvm_fault() at uvm_fault+0x1b3 + * upageflttrap() at upageflttrap+0x62 + */ + KERNEL_LOCK(); mtx_enter(&uvm_pseg_lck); pager_seg_restart: @@ -159,6 +177,7 @@ pager_seg_restart: if (!UVM_PSEG_INUSE(pseg, i)) { pseg->use |= 1 << i; mtx_leave(&uvm_pseg_lck); + KERNEL_UNLOCK(); return pseg->start + i * MAXBSIZE; } } @@ -171,6 +190,7 @@ pager_seg_fail: } mtx_leave(&uvm_pseg_lck); + KERNEL_UNLOCK(); return 0; } @@ -543,11 +563,15 @@ ReTry: /* XXX daddr_t -> int */ int nswblk = (result == VM_PAGER_AGAIN) ? 
swblk : 0; if (pg->pg_flags & PQ_ANON) { + rw_enter(pg->uanon->an_lock, RW_WRITE); pg->uanon->an_swslot = nswblk; + rw_exit(pg->uanon->an_lock); } else { + rw_enter(pg->uobject->vmobjlock, RW_WRITE); uao_set_swslot(pg->uobject, pg->offset >> PAGE_SHIFT, nswblk); + rw_exit(pg->uobject->vmobjlock); } } if (result == VM_PAGER_AGAIN) { @@ -612,6 +636,8 @@ uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page *pg, { int lcv; + KASSERT(uobj == NULL || rw_write_held(uobj->vmobjlock)); + /* drop all pages but "pg" */ for (lcv = 0 ; lcv < *npages ; lcv++) { /* skip "pg" or empty slot */ @@ -625,10 +651,13 @@ uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page *pg, */ if (!uobj) { if (ppsp[lcv]->pg_flags & PQ_ANON) { + rw_enter(ppsp[lcv]->uanon->an_lock, RW_WRITE); if (flags & PGO_REALLOCSWAP) /* zap swap block */ ppsp[lcv]->uanon->an_swslot = 0; } else { + rw_enter(ppsp[lcv]->uobject->vmobjlock, + RW_WRITE); if (flags & PGO_REALLOCSWAP) uao_set_swslot(ppsp[lcv]->uobject, ppsp[lcv]->offset >> PAGE_SHIFT, 0); @@ -649,7 +678,6 @@ uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page *pg, UVM_PAGE_OWN(ppsp[lcv], NULL); /* kills anon and frees pg */ - rw_enter(ppsp[lcv]->uanon->an_lock, RW_WRITE); uvm_anon_release(ppsp[lcv]->uanon); continue; @@ -672,6 +700,14 @@ uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page *pg, pmap_clear_modify(ppsp[lcv]); atomic_setbits_int(&ppsp[lcv]->pg_flags, PG_CLEAN); } + + /* if anonymous cluster, unlock object and move on */ + if (!uobj) { + if (ppsp[lcv]->pg_flags & PQ_ANON) + rw_exit(ppsp[lcv]->uanon->an_lock); + else + rw_exit(ppsp[lcv]->uobject->vmobjlock); + } } } @@ -736,6 +772,7 @@ uvm_aio_aiodone(struct buf *bp) swap = (pg->pg_flags & PQ_SWAPBACKED) != 0; if (!swap) { uobj = pg->uobject; + rw_enter(uobj->vmobjlock, RW_WRITE); } } KASSERT(swap || pg->uobject == uobj); @@ -763,6 +800,9 @@ uvm_aio_aiodone(struct buf *bp) } } uvm_page_unbusy(pgs, npages); + if (!swap) { + rw_exit(uobj->vmobjlock); + } #ifdef UVM_SWAP_ENCRYPT freed: diff --git a/sys/uvm/uvm_pdaemon.c b/sys/uvm/uvm_pdaemon.c index e0ab150cddc..fc4b7ba5b20 100644 --- a/sys/uvm/uvm_pdaemon.c +++ b/sys/uvm/uvm_pdaemon.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_pdaemon.c,v 1.93 2021/06/29 01:46:35 jsg Exp $ */ +/* $OpenBSD: uvm_pdaemon.c,v 1.94 2021/12/15 12:53:53 mpi Exp $ */ /* $NetBSD: uvm_pdaemon.c,v 1.23 2000/08/20 10:24:14 bjh21 Exp $ */ /* @@ -440,19 +440,6 @@ uvmpd_scan_inactive(struct pglist *pglst) uvmexp.pdscans++; nextpg = TAILQ_NEXT(p, pageq); - /* - * move referenced pages back to active queue and - * skip to next page (unlikely to happen since - * inactive pages shouldn't have any valid mappings - * and we cleared reference before deactivating). - */ - - if (pmap_is_referenced(p)) { - uvm_pageactivate(p); - uvmexp.pdreact++; - continue; - } - if (p->pg_flags & PQ_ANON) { anon = p->uanon; KASSERT(anon != NULL); @@ -461,6 +448,16 @@ uvmpd_scan_inactive(struct pglist *pglst) /* lock failed, skip this page */ continue; } + /* + * move referenced pages back to active queue + * and skip to next page. 
+ */ + if (pmap_is_referenced(p)) { + uvm_pageactivate(p); + rw_exit(anon->an_lock); + uvmexp.pdreact++; + continue; + } if (p->pg_flags & PG_BUSY) { rw_exit(anon->an_lock); uvmexp.pdbusy++; @@ -471,7 +468,23 @@ uvmpd_scan_inactive(struct pglist *pglst) } else { uobj = p->uobject; KASSERT(uobj != NULL); + if (rw_enter(uobj->vmobjlock, + RW_WRITE|RW_NOSLEEP)) { + /* lock failed, skip this page */ + continue; + } + /* + * move referenced pages back to active queue + * and skip to next page. + */ + if (pmap_is_referenced(p)) { + uvm_pageactivate(p); + rw_exit(uobj->vmobjlock); + uvmexp.pdreact++; + continue; + } if (p->pg_flags & PG_BUSY) { + rw_exit(uobj->vmobjlock); uvmexp.pdbusy++; /* someone else owns page, skip it */ continue; @@ -507,6 +520,8 @@ uvmpd_scan_inactive(struct pglist *pglst) /* remove from object */ anon->an_page = NULL; rw_exit(anon->an_lock); + } else { + rw_exit(uobj->vmobjlock); } continue; } @@ -518,6 +533,8 @@ uvmpd_scan_inactive(struct pglist *pglst) if (free + uvmexp.paging > uvmexp.freetarg << 2) { if (anon) { rw_exit(anon->an_lock); + } else { + rw_exit(uobj->vmobjlock); } continue; } @@ -533,6 +550,8 @@ uvmpd_scan_inactive(struct pglist *pglst) uvm_pageactivate(p); if (anon) { rw_exit(anon->an_lock); + } else { + rw_exit(uobj->vmobjlock); } continue; } @@ -602,6 +621,9 @@ uvmpd_scan_inactive(struct pglist *pglst) UVM_PAGE_OWN(p, NULL); if (anon) rw_exit(anon->an_lock); + else + rw_exit( + uobj->vmobjlock); continue; } swcpages = 0; /* cluster is empty */ @@ -635,6 +657,8 @@ uvmpd_scan_inactive(struct pglist *pglst) if (p) { /* if we just added a page to cluster */ if (anon) rw_exit(anon->an_lock); + else + rw_exit(uobj->vmobjlock); /* cluster not full yet? */ if (swcpages < swnpages) @@ -748,6 +772,8 @@ uvmpd_scan_inactive(struct pglist *pglst) if (swap_backed) { if (anon) rw_enter(anon->an_lock, RW_WRITE); + else + rw_enter(uobj->vmobjlock, RW_WRITE); } #ifdef DIAGNOSTIC @@ -810,6 +836,8 @@ uvmpd_scan_inactive(struct pglist *pglst) */ if (anon) rw_exit(anon->an_lock); + else if (uobj) + rw_exit(uobj->vmobjlock); if (nextpg && (nextpg->pg_flags & PQ_INACTIVE) == 0) { nextpg = TAILQ_FIRST(pglst); /* reload! */ @@ -920,8 +948,12 @@ uvmpd_scan(void) KASSERT(p->uanon != NULL); if (rw_enter(p->uanon->an_lock, RW_WRITE|RW_NOSLEEP)) continue; - } else + } else { KASSERT(p->uobject != NULL); + if (rw_enter(p->uobject->vmobjlock, + RW_WRITE|RW_NOSLEEP)) + continue; + } /* * if there's a shortage of swap, free any swap allocated @@ -959,6 +991,8 @@ uvmpd_scan(void) } if (p->pg_flags & PQ_ANON) rw_exit(p->uanon->an_lock); + else + rw_exit(p->uobject->vmobjlock); } } @@ -982,6 +1016,10 @@ uvmpd_drop(struct pglist *pglst) continue; if (p->pg_flags & PG_CLEAN) { + struct uvm_object * uobj = p->uobject; + + rw_enter(uobj->vmobjlock, RW_WRITE); + uvm_lock_pageq(); /* * we now have the page queues locked. * the page is not busy. 
if the page is clean we @@ -997,6 +1035,8 @@ uvmpd_drop(struct pglist *pglst) pmap_page_protect(p, PROT_NONE); uvm_pagefree(p); } + uvm_unlock_pageq(); + rw_exit(uobj->vmobjlock); } } } @@ -1004,13 +1044,9 @@ uvmpd_drop(struct pglist *pglst) void uvmpd_hibernate(void) { - uvm_lock_pageq(); - uvmpd_drop(&uvm.page_inactive_swp); uvmpd_drop(&uvm.page_inactive_obj); uvmpd_drop(&uvm.page_active); - - uvm_unlock_pageq(); } #endif diff --git a/sys/uvm/uvm_vnode.c b/sys/uvm/uvm_vnode.c index ed1ba9fc870..c9e7dd63288 100644 --- a/sys/uvm/uvm_vnode.c +++ b/sys/uvm/uvm_vnode.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_vnode.c,v 1.120 2021/12/07 02:58:46 cheloha Exp $ */ +/* $OpenBSD: uvm_vnode.c,v 1.121 2021/12/15 12:53:53 mpi Exp $ */ /* $NetBSD: uvm_vnode.c,v 1.36 2000/11/24 20:34:01 chs Exp $ */ /* @@ -280,8 +280,9 @@ uvn_reference(struct uvm_object *uobj) panic("uvn_reference: invalid state"); } #endif - KERNEL_ASSERT_LOCKED(); + rw_enter(uobj->vmobjlock, RW_WRITE); uobj->uo_refs++; + rw_exit(uobj->vmobjlock); } /* @@ -300,9 +301,10 @@ uvn_detach(struct uvm_object *uobj) struct vnode *vp; int oldflags; - KERNEL_ASSERT_LOCKED(); + rw_enter(uobj->vmobjlock, RW_WRITE); uobj->uo_refs--; /* drop ref! */ if (uobj->uo_refs) { /* still more refs */ + rw_exit(uobj->vmobjlock); return; } @@ -323,8 +325,7 @@ uvn_detach(struct uvm_object *uobj) if (uvn->u_flags & UVM_VNODE_CANPERSIST) { /* won't block */ uvn_flush(uobj, 0, 0, PGO_DEACTIVATE|PGO_ALLPAGES); - vrele(vp); /* drop vnode reference */ - return; + goto out; } /* its a goner! */ @@ -353,7 +354,8 @@ uvn_detach(struct uvm_object *uobj) /* wait on any outstanding io */ while (uobj->uo_npages && uvn->u_flags & UVM_VNODE_RELKILL) { uvn->u_flags |= UVM_VNODE_IOSYNC; - tsleep_nsec(&uvn->u_nio, PVM, "uvn_term", INFSLP); + rwsleep_nsec(&uvn->u_nio, uobj->vmobjlock, PVM, "uvn_term", + INFSLP); } if ((uvn->u_flags & UVM_VNODE_RELKILL) == 0) @@ -373,6 +375,8 @@ uvn_detach(struct uvm_object *uobj) /* wake up any sleepers */ if (oldflags & UVM_VNODE_WANTED) wakeup(uvn); +out: + rw_exit(uobj->vmobjlock); /* drop our reference to the vnode. 
*/ vrele(vp); @@ -409,10 +413,13 @@ void uvm_vnp_terminate(struct vnode *vp) { struct uvm_vnode *uvn = vp->v_uvm; + struct uvm_object *uobj = &uvn->u_obj; int oldflags; /* check if it is valid */ + rw_enter(uobj->vmobjlock, RW_WRITE); if ((uvn->u_flags & UVM_VNODE_VALID) == 0) { + rw_exit(uobj->vmobjlock); return; } @@ -479,7 +486,8 @@ uvm_vnp_terminate(struct vnode *vp) */ #endif uvn->u_flags |= UVM_VNODE_IOSYNC; - tsleep_nsec(&uvn->u_nio, PVM, "uvn_term", INFSLP); + rwsleep_nsec(&uvn->u_nio, uobj->vmobjlock, PVM, "uvn_term", + INFSLP); } /* @@ -512,6 +520,8 @@ uvm_vnp_terminate(struct vnode *vp) if (oldflags & UVM_VNODE_WANTED) wakeup(uvn); + + rw_exit(uobj->vmobjlock); } /* @@ -589,7 +599,7 @@ uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) boolean_t retval, need_iosync, needs_clean; voff_t curoff; - KERNEL_ASSERT_LOCKED(); + KASSERT(rw_write_held(uobj->vmobjlock)); TAILQ_INIT(&dead); /* get init vals and determine how we are going to traverse object */ @@ -673,8 +683,8 @@ uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) atomic_setbits_int(&pp->pg_flags, PG_WANTED); uvm_unlock_pageq(); - tsleep_nsec(pp, PVM, "uvn_flsh", - INFSLP); + rwsleep_nsec(pp, uobj->vmobjlock, PVM, + "uvn_flsh", INFSLP); uvm_lock_pageq(); curoff -= PAGE_SIZE; continue; @@ -824,7 +834,8 @@ ReTry: if (need_iosync) { while (uvn->u_nio != 0) { uvn->u_flags |= UVM_VNODE_IOSYNC; - tsleep_nsec(&uvn->u_nio, PVM, "uvn_flush", INFSLP); + rwsleep_nsec(&uvn->u_nio, uobj->vmobjlock, PVM, + "uvn_flush", INFSLP); } if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED) wakeup(&uvn->u_flags); @@ -878,7 +889,7 @@ uvn_put(struct uvm_object *uobj, struct vm_page **pps, int npages, int flags) { int retval; - KERNEL_ASSERT_LOCKED(); + KASSERT(rw_write_held(uobj->vmobjlock)); retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE); @@ -903,7 +914,8 @@ uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, int lcv, result, gotpages; boolean_t done; - KERNEL_ASSERT_LOCKED(); + KASSERT(((flags & PGO_LOCKED) != 0 && rw_lock_held(uobj->vmobjlock)) || + (flags & PGO_LOCKED) == 0); /* step 1: handled the case where fault data structures are locked. */ if (flags & PGO_LOCKED) { @@ -1033,7 +1045,8 @@ uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, /* page is there, see if we need to wait on it */ if ((ptmp->pg_flags & PG_BUSY) != 0) { atomic_setbits_int(&ptmp->pg_flags, PG_WANTED); - tsleep_nsec(ptmp, PVM, "uvn_get", INFSLP); + rwsleep_nsec(ptmp, uobj->vmobjlock, PVM, + "uvn_get", INFSLP); continue; /* goto top of pps while loop */ } @@ -1077,6 +1090,7 @@ uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, uvm_lock_pageq(); uvm_pagefree(ptmp); uvm_unlock_pageq(); + rw_exit(uobj->vmobjlock); return result; } @@ -1098,6 +1112,8 @@ uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, } + + rw_exit(uobj->vmobjlock); return (VM_PAGER_OK); } @@ -1113,6 +1129,7 @@ uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, int uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw) { + struct uvm_object *uobj = &uvn->u_obj; struct vnode *vn; struct uio uio; struct iovec iov; @@ -1123,6 +1140,8 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw) int netunlocked = 0; int lkflags = (flags & PGO_NOWAIT) ? LK_NOWAIT : 0; + KASSERT(rw_write_held(uobj->vmobjlock)); + /* init values */ waitf = (flags & PGO_SYNCIO) ? 
M_WAITOK : M_NOWAIT; vn = uvn->u_vnode; @@ -1134,7 +1153,8 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw) return VM_PAGER_AGAIN; } uvn->u_flags |= UVM_VNODE_IOSYNCWANTED; - tsleep_nsec(&uvn->u_flags, PVM, "uvn_iosync", INFSLP); + rwsleep_nsec(&uvn->u_flags, uobj->vmobjlock, PVM, "uvn_iosync", + INFSLP); } /* check size */ @@ -1157,6 +1177,7 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw) * (this time with sleep ok). */ uvn->u_nio++; /* we have an I/O in progress! */ + rw_exit(uobj->vmobjlock); if (kva == 0) kva = uvm_pagermapin(pps, npages, mapinflags | UVMPAGER_MAPIN_WAITOK); @@ -1200,6 +1221,7 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw) * Ideally, this kind of operation *should* work. */ result = 0; + KERNEL_LOCK(); if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL | lkflags); if (result == 0) { @@ -1215,6 +1237,7 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw) VOP_UNLOCK(vn); } + KERNEL_UNLOCK(); if (netunlocked) NET_LOCK(); @@ -1241,6 +1264,7 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw) uvm_pagermapout(kva, npages); /* now clean up the object (i.e. drop I/O count) */ + rw_enter(uobj->vmobjlock, RW_WRITE); uvn->u_nio--; /* I/O DONE! */ if ((uvn->u_flags & UVM_VNODE_IOSYNC) != 0 && uvn->u_nio == 0) { wakeup(&uvn->u_nio); @@ -1252,8 +1276,12 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw) KASSERT(flags & PGO_NOWAIT); return VM_PAGER_AGAIN; } else { - while (rebooting) - tsleep_nsec(&rebooting, PVM, "uvndead", INFSLP); + if (rebooting) { + KERNEL_LOCK(); + while (rebooting) + tsleep_nsec(&rebooting, PVM, "uvndead", INFSLP); + KERNEL_UNLOCK(); + } return VM_PAGER_ERROR; } } @@ -1300,11 +1328,14 @@ int uvm_vnp_uncache(struct vnode *vp) { struct uvm_vnode *uvn = vp->v_uvm; + struct uvm_object *uobj = &uvn->u_obj; /* lock uvn part of the vnode and check if we need to do anything */ + rw_enter(uobj->vmobjlock, RW_WRITE); if ((uvn->u_flags & UVM_VNODE_VALID) == 0 || (uvn->u_flags & UVM_VNODE_BLOCKED) != 0) { + rw_exit(uobj->vmobjlock); return TRUE; } @@ -1314,6 +1345,7 @@ uvm_vnp_uncache(struct vnode *vp) */ uvn->u_flags &= ~UVM_VNODE_CANPERSIST; if (uvn->u_obj.uo_refs) { + rw_exit(uobj->vmobjlock); return FALSE; } @@ -1323,6 +1355,7 @@ uvm_vnp_uncache(struct vnode *vp) */ vref(vp); /* seems ok, even with VOP_LOCK */ uvn->u_obj.uo_refs++; /* value is now 1 */ + rw_exit(uobj->vmobjlock); #ifdef VFSLCKDEBUG /* @@ -1374,6 +1407,11 @@ void uvm_vnp_setsize(struct vnode *vp, off_t newsize) { struct uvm_vnode *uvn = vp->v_uvm; + struct uvm_object *uobj = &uvn->u_obj; + + KERNEL_ASSERT_LOCKED(); + + rw_enter(uobj->vmobjlock, RW_WRITE); /* lock uvn and check for valid object, and if valid: do it! */ if (uvn->u_flags & UVM_VNODE_VALID) { @@ -1389,6 +1427,7 @@ uvm_vnp_setsize(struct vnode *vp, off_t newsize) } uvn->u_size = newsize; } + rw_exit(uobj->vmobjlock); } /* @@ -1447,6 +1486,7 @@ uvm_vnp_sync(struct mount *mp) /* step 3: we now have a list of uvn's that may need cleaning. 
*/ SIMPLEQ_FOREACH(uvn, &uvn_sync_q, u_syncq) { + rw_enter(uvn->u_obj.vmobjlock, RW_WRITE); #ifdef DEBUG if (uvn->u_flags & UVM_VNODE_DYING) { printf("uvm_vnp_sync: dying vnode on sync list\n"); @@ -1465,6 +1505,7 @@ uvm_vnp_sync(struct mount *mp) LIST_REMOVE(uvn, u_wlist); uvn->u_flags &= ~UVM_VNODE_WRITEABLE; } + rw_exit(uvn->u_obj.vmobjlock); /* now drop our reference to the uvn */ uvn_detach(&uvn->u_obj);
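
The locking discipline described in the commit message above can be summarized as: callers acquire an object's `vmobjlock' (always in write mode, for now) before calling into its pager, the pager asserts ownership with rw_write_held(), and *_get()-style operations return the object unlocked and sleep with the lock released. The code below is a minimal illustrative sketch of that convention and is not part of the patch; the example_* functions are hypothetical stand-ins, but the primitives they use (rw_enter(), rw_exit(), rwsleep_nsec(), KASSERT()/rw_write_held()) are the ones that appear throughout the diff.

/*
 * Illustrative sketch only -- not part of the committed change.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/rwlock.h>
#include <uvm/uvm.h>

/* Caller side: lock the object around a pager operation, as for pgo_flush(). */
void
example_flush_caller(struct uvm_object *uobj, voff_t start, voff_t stop)
{
	rw_enter(uobj->vmobjlock, RW_WRITE);	/* write-lock, as everywhere for now */
	(void) uobj->pgops->pgo_flush(uobj, start, stop, PGO_DEACTIVATE);
	rw_exit(uobj->vmobjlock);
}

/*
 * Pager side: a *_get()-style operation expects the object locked on entry
 * and returns it unlocked, sleeping with the lock dropped in between.
 */
int
example_pgo_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps)
{
	struct vm_page *pg;

	KASSERT(rw_write_held(uobj->vmobjlock));

	while ((pg = uvm_pagelookup(uobj, offset)) != NULL &&
	    (pg->pg_flags & PG_BUSY)) {
		atomic_setbits_int(&pg->pg_flags, PG_WANTED);
		/* rwsleep_nsec() releases and reacquires the vmobjlock */
		rwsleep_nsec(pg, uobj->vmobjlock, PVM, "exget", INFSLP);
	}
	*pps = pg;

	rw_exit(uobj->vmobjlock);	/* *_get() returns the object unlocked */
	return (pg != NULL) ? VM_PAGER_OK : VM_PAGER_ERROR;
}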