Commit 1087d6c6 authored by openeuler-ci-bot, committed by Gitee
obj-m := kvm_ept_idle.o
kvm_ept_idle-y := ept_idle.o ept_idle_native_pagewalk.o tlb_flush.o
include Kbuild
# Default to the running kernel's build tree; both variables can still be
# overridden from the make command line or the environment.
KERNEL_SRC_DIR ?= /lib/modules/$(shell uname -r)/build
MODULE_DIR ?= $(shell pwd)
default:
$(MAKE) -C $(KERNEL_SRC_DIR) M=$(MODULE_DIR) modules
clean:
rm -f *.o *.ko *.mod.c modules.order Module.symvers
# memory-scan
#### Description
A kernel module for scanning the page tables of processes and VMs.
#### Software Architecture
memory-scan is derived from the kernel module in memory-optimizer (https://github.com/intel/memory-optimizer/tree/master/kernel_module). It scans the page tables of processes and VMs to report the access state of their memory pages.
#### Installation
See the build and load steps under Instructions below.
#### Instructions
openEuler users can build and load memory-scan as follows:
1. make
2. insmod memory_scan.ko
#### Contribution
......
# memory-scan
#### Introduction
A kernel module for scanning the page tables of processes and VMs.
#### Software Architecture
memory-scan is derived from memory-optimizer (https://github.com/intel/memory-optimizer/tree/master/kernel_module). It is a kernel module that scans the access state of the memory pages of a process or VM.
#### Installation
See the build and load steps under Instructions below.
#### Instructions
openEuler users can build and load memory-scan as follows:
1. make
2. insmod memory_scan.ko
#### Contribution
......
(This diff is collapsed.)
#ifndef _EPT_IDLE_H
#define _EPT_IDLE_H
#include "ept_idle_common.h"
#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */
#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */
#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */
enum ProcIdlePageType {
PTE_ACCESSED, /* 4k page */
PMD_ACCESSED, /* 2M page */
PUD_PRESENT, /* 1G page */
PTE_DIRTY,
PMD_DIRTY,
PTE_IDLE,
PMD_IDLE,
PMD_IDLE_PTES, /* all PTE idle */
PTE_HOLE,
PMD_HOLE,
PIP_CMD,
IDLE_PAGE_TYPE_MAX
};
#define PIP_TYPE(a) (0xf & ((a) >> 4))
#define PIP_SIZE(a) (0xf & (a))
#define PIP_COMPOSE(type, nr) (((type) << 4) | (nr))
#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0)
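/*
 * Illustrative sketch (not part of the module): a record byte produced with
 * PIP_COMPOSE() carries a ProcIdlePageType in its high nibble and a count in
 * its low nibble; PIP_TYPE()/PIP_SIZE() undo that packing. The helper name
 * below is hypothetical.
 */
static inline void pip_decode_example(uint8_t pip, int *type, int *nr)
{
	*type = PIP_TYPE(pip);	/* one of enum ProcIdlePageType */
	*nr = PIP_SIZE(pip);	/* number of entries this record covers */
}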
/*
 * EPT leaf-entry bits: bits 0-2 are the R/W/X permission bits (any of them
 * set means the mapping is present); bits 8 and 9 are the EPT accessed and
 * dirty bits.
 */
#define _PAGE_BIT_EPT_ACCESSED 8
#define _PAGE_BIT_EPT_DIRTY 9
#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED)
#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY)
#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7))
static inline int ept_pte_present(pte_t a)
{
return pte_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pmd_present(pmd_t a)
{
return pmd_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pud_present(pud_t a)
{
return pud_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_p4d_present(p4d_t a)
{
return p4d_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pgd_present(pgd_t a)
{
return pgd_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pte_accessed(pte_t a)
{
return pte_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pmd_accessed(pmd_t a)
{
return pmd_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pud_accessed(pud_t a)
{
return pud_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_p4d_accessed(p4d_t a)
{
return p4d_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pgd_accessed(pgd_t a)
{
return pgd_flags(a) & _PAGE_EPT_ACCESSED;
}
extern struct file_operations proc_ept_idle_operations;
#define EPT_IDLE_KBUF_FULL 1
#define EPT_IDLE_BUF_FULL 2
#define EPT_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3)
#define EPT_IDLE_KBUF_SIZE 8000
#define IDLE_PAGE_SET_PID _IOW(0x1, 0x1, pid_t)
struct ept_idle_ctrl {
	struct mm_struct *mm;
	struct kvm *kvm;
	uint8_t kpie[EPT_IDLE_KBUF_SIZE];	/* kernel-side buffer of encoded records */
	int pie_read;
	int pie_read_max;
	void __user *buf;			/* user-space output buffer */
	int buf_size;
	int bytes_copied;			/* bytes copied to @buf so far */
	unsigned long next_hva;			/* GPA for EPT; VA for PT */
	unsigned long gpa_to_hva;
	unsigned long restart_gpa;
	unsigned long last_va;
	unsigned int flags;
};
#endif
// SPDX-License-Identifier: GPL-2.0
#ifndef _EPT_IDLE_COMMON_H
#define _EPT_IDLE_COMMON_H
/* Provide p4d_* fallbacks for old kernels without 5-level paging support */
#ifndef CONFIG_PGTABLE_LEVELS
#define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#else
#if CONFIG_PGTABLE_LEVELS < 4
#define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#endif // #if CONFIG_PGTABLE_LEVELS < 4
#endif // #ifndef CONFIG_PGTABLE_LEVELS
#ifdef EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#define p4d_t pgd_t
#define p4d_flags pgd_flags
#define p4d_offset(pgd, start) (pgd)
#define p4d_addr_end(addr, end) (end)
#define p4d_present(p4d) 1
#define p4d_ERROR(p4d) do { } while(0)
#define p4d_clear pgd_clear
#define p4d_none(p4d) 0
#define p4d_bad(p4d) 0
#define p4d_clear_bad pgd_clear_bad
#endif
#ifndef pgd_offset_pgd
#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
#endif
#endif
// SPDX-License-Identifier: GPL-2.0
// Copied from kernel mm/pagewalk.c, modified by yuan.yao@intel.com
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
#include "ept_idle_common.h"
#ifdef CONFIG_HUGETLB_PAGE
int pmd_huge(pmd_t pmd)
{
return !pmd_none(pmd) &&
(pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
}
int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
/*
* ept_idle_huge_pte_offset() - Walk the page table to resolve the hugepage
* entry at address @addr
*
* Return: Pointer to page table or swap entry (PUD or PMD) for
* address @addr, or NULL if a p*d_none() entry is encountered and the
* size @sz doesn't match the hugepage size at this level of the page
* table.
*/
pte_t *ept_idle_huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
return NULL;
p4d = p4d_offset(pgd, addr);
if (!p4d_present(*p4d))
return NULL;
pud = pud_offset(p4d, addr);
if (sz != PUD_SIZE && pud_none(*pud))
return NULL;
/* hugepage or swap? */
if (pud_huge(*pud) || !pud_present(*pud))
return (pte_t *)pud;
pmd = pmd_offset(pud, addr);
if (sz != PMD_SIZE && pmd_none(*pmd))
return NULL;
/* hugepage or swap? */
if (pmd_huge(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
return NULL;
}
#else // #ifdef CONFIG_HUGETLB_PAGE
#define pud_huge(x) 0
#define pmd_huge(x) 0
#define ept_idle_huge_pte_offset(mm, address, sz) 0
#endif
#ifndef VM_BUG_ON_VMA
#define VM_BUG_ON_VMA(cond, vma) \
do { \
if (unlikely(cond)) { \
BUG(); \
} \
} while (0)
#endif
#ifndef VM_BUG_ON_MM
#define VM_BUG_ON_MM VM_BUG_ON_VMA
#endif
static inline int ept_idle_p4d_none_or_clear_bad(p4d_t *p4d)
{
if (p4d_none(*p4d))
return 1;
if (unlikely(p4d_bad(*p4d))) {
p4d_clear_bad(p4d);
return 1;
}
return 0;
}
static inline spinlock_t *ept_idle_pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
spinlock_t *ptl;
VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
ptl = pud_lock(vma->vm_mm, pud);
if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
return ptl;
spin_unlock(ptl);
return NULL;
}
void p4d_clear_bad(p4d_t *p4d)
{
p4d_ERROR(*p4d);
p4d_clear(p4d);
}
void pmd_clear_bad(pmd_t *pmd)
{
pmd_ERROR(*pmd);
pmd_clear(pmd);
}
#ifdef _EPT_IDLE_SPLIT_PMD_
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pte_t *pte;
int err = 0;
pte = pte_offset_map(pmd, addr);
for (;;) {
err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
addr += PAGE_SIZE;
if (addr == end)
break;
pte++;
}
pte_unmap(pte);
return err;
}
#endif
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pmd_t *pmd;
unsigned long next;
int err = 0;
pmd = pmd_offset(pud, addr);
do {
#ifdef _EPT_IDLE_SPLIT_PMD_
again:
#endif
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd) || !walk->vma) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
/*
* This implies that each ->pmd_entry() handler
* needs to know about pmd_trans_huge() pmds
*/
if (walk->pmd_entry)
err = walk->pmd_entry(pmd, addr, next, walk);
if (err)
break;
#ifdef _EPT_IDLE_SPLIT_PMD_
/*
* Check this here so we only break down trans_huge
* pages when we _need_ to
*/
if (!walk->pte_entry)
continue;
split_huge_pmd(walk->vma, pmd, addr);
if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
#endif
} while (pmd++, addr = next, addr != end);
return err;
}
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pud_t *pud;
unsigned long next;
int err = 0;
pud = pud_offset(p4d, addr);
do {
#ifdef _EPT_IDLE_SPLIT_PUD_
again:
#endif
next = pud_addr_end(addr, end);
if (pud_none(*pud) || !walk->vma) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pud_entry) {
spinlock_t *ptl = ept_idle_pud_trans_huge_lock(pud, walk->vma);
if (ptl) {
err = walk->pud_entry(pud, addr, next, walk);
spin_unlock(ptl);
if (err)
break;
continue;
}
}
#ifdef _EPT_IDLE_SPLIT_PUD_
split_huge_pud(walk->vma, pud, addr);
if (pud_none(*pud))
goto again;
#endif
if (walk->pmd_entry || walk->pte_entry)
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
return err;
}
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
p4d_t *p4d;
unsigned long next;
int err = 0;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (ept_idle_p4d_none_or_clear_bad(p4d)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pmd_entry || walk->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
} while (p4d++, addr = next, addr != end);
return err;
}
static int walk_pgd_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pgd_t *pgd;
unsigned long next;
int err = 0;
pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pmd_entry || walk->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
} while (pgd++, addr = next, addr != end);
return err;
}
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
unsigned long end)
{
unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
return boundary < end ? boundary : end;
}
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct hstate *h = hstate_vma(vma);
unsigned long next;
unsigned long hmask = huge_page_mask(h);
unsigned long sz = huge_page_size(h);
pte_t *pte;
int err = 0;
do {
next = hugetlb_entry_end(h, addr, end);
pte = ept_idle_huge_pte_offset(walk->mm, addr & hmask, sz);
if (pte)
err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
else if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
} while (addr = next, addr != end);
return err;
}
#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */
/*
* Decide whether we really walk over the current vma on [@start, @end)
* or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case the current walk is aborted.
*/
static int walk_page_test(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
if (walk->test_walk)
return walk->test_walk(start, end, walk);
/*
* vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
* range, so we don't walk over it as we do for normal vmas. However,
 * some callers are interested in handling hole ranges and they don't
* want to just ignore any single address range. Such users certainly
* define their ->pte_hole() callbacks, so let's delegate them to handle
* vma(VM_PFNMAP).
*/
if (vma->vm_flags & VM_PFNMAP) {
int err = 1;
if (walk->pte_hole)
err = walk->pte_hole(start, end, walk);
return err ? err : 1;
}
return 0;
}
static int __walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
int err = 0;
struct vm_area_struct *vma = walk->vma;
if (vma && is_vm_hugetlb_page(vma)) {
if (walk->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);
return err;
}
/**
* walk_page_range - walk page table with caller specific callbacks
* @start: start address of the virtual address range
* @end: end address of the virtual address range
* @walk: mm_walk structure defining the callbacks and the target address space
*
* Recursively walk the page table tree of the process represented by @walk->mm
* within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
* pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
* callbacks, the associated entries/pages are just ignored.
* The return values of these callbacks are commonly defined like below:
*
* - 0 : succeeded to handle the current entry, and if you don't reach the
* end address yet, continue to walk.
* - >0 : succeeded to handle the current entry, and return to the caller
* with caller specific value.
* - <0 : failed to handle the current entry, and return to the caller
* with error code.
*
* Before starting to walk page table, some callers want to check whether
* they really want to walk over the current vma, typically by checking
* its vm_flags. walk_page_test() and @walk->test_walk() are used for this
* purpose.
*
* struct mm_walk keeps current values of some common data like vma and pmd,
* which are useful for the access from callbacks. If you want to pass some
* caller-specific data to callbacks, @walk->private should be helpful.
*
* Locking:
* Callers of walk_page_range() and walk_page_vma() should hold
 * @walk->mm->mmap_sem, because these functions traverse the vma list and/or
 * access the vma's data.
*/
int ept_idle_walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
int err = 0;
unsigned long next;
struct vm_area_struct *vma;
if (start >= end)
return -EINVAL;
if (!walk->mm)
return -EINVAL;
VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
vma = find_vma(walk->mm, start);
do {
if (!vma) { /* after the last vma */
walk->vma = NULL;
next = end;
} else if (start < vma->vm_start) { /* outside vma */
walk->vma = NULL;
next = min(end, vma->vm_start);
} else { /* inside vma */
walk->vma = vma;
next = min(end, vma->vm_end);
vma = vma->vm_next;
err = walk_page_test(start, next, walk);
if (err > 0) {
/*
* positive return values are purely for
* controlling the pagewalk, so should never
* be passed to the callers.
*/
err = 0;
continue;
}
if (err < 0)
break;
}
if (walk->vma || walk->pte_hole)
err = __walk_page_range(start, next, walk);
if (err)
break;
} while (start = next, start < end);
return err;
}
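#if 0	/* Illustrative sketch only; not compiled into the module. */
/*
 * Example of how a caller might drive ept_idle_walk_page_range(): count the
 * populated PMD entries of an mm. Field names follow the pre-5.4 struct
 * mm_walk layout used throughout this file; the function names here are
 * hypothetical, and the caller must hold mm->mmap_sem for read.
 */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (!pmd_none(*pmd))
		(*count)++;
	return 0;	/* 0 keeps walking; <0 aborts; >0 stops with that value */
}

static unsigned long example_count_pmds(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.pmd_entry = example_pmd_entry,
		.mm = mm,
		.private = &count,
	};

	ept_idle_walk_page_range(start, end, &walk);
	return count;
}
#endif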
#ifndef _EPT_IDLE_NATIVE_PAGEWALK_H
#define _EPT_IDLE_NATIVE_PAGEWALK_H
int ept_idle_walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk);
#endif
#include "tlb_flush.h"
/* copied from 4.20 kernel:
* See Documentation/x86/tlb.txt for details. We choose 33
* because it is large enough to cover the vast majority (at
* least 95%) of allocations, and is small enough that we are
* confident it will not cause too much overhead. Each single
* flush is about 100 ns, so this caps the maximum overhead at
* _about_ 3,000 ns.
*
* This is in units of pages.
*/
static unsigned long copied_tlb_single_page_flush_ceiling __read_mostly = 33;
static bool copied_tlb_is_not_lazy(int cpu, void *data)
{
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}
/*
* flush_tlb_func_common()'s memory ordering requirement is that any
* TLB fills that happen after we flush the TLB are ordered after we
* read active_mm's tlb_gen. We don't need any explicit barriers
* because all x86 flush operations are serializing and the
* atomic64_read operation won't be reordered by the compiler.
*/
static void copied_flush_tlb_func_common(const struct flush_tlb_info *f,
bool local, enum tlb_flush_reason reason)
{
/*
* We have three different tlb_gen values in here. They are:
*
* - mm_tlb_gen: the latest generation.
* - local_tlb_gen: the generation that this CPU has already caught
* up to.
* - f->new_tlb_gen: the generation that the requester of the flush
* wants us to catch up to.
*/
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
/*
	 * init_mm is an unexported variable, but we do not need to check
	 * for it here: we only want to flush the TLB on remote CPU cores
	 * that are running a task whose address space is f->mm.
*/
#if 0
if (unlikely(loaded_mm == &init_mm))
return;
#else
if (unlikely(loaded_mm != f->mm)) {
return;
}
#endif
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
/*
* The caller of this function will set is_lazy to false explicitly
	 * so we do not need to handle that case here; just skip it.
*/
#if 0
if (this_cpu_read(cpu_tlbstate.is_lazy)) {
/*
* We're in lazy mode. We need to at least flush our
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
* This should be rare, with native_flush_tlb_others skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
}
#endif
if (unlikely(local_tlb_gen == mm_tlb_gen)) {
/*
* There's nothing to do: we're already up to date. This can
* happen if two concurrent flushes happen -- the first flush to
* be handled can catch us all the way up, leaving no work for
* the second flush.
*/
// trace_tlb_flush(reason, 0);
return;
}
WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
/*
* If we get to this point, we know that our TLB is out of date.
* This does not strictly imply that we need to flush (it's
* possible that f->new_tlb_gen <= local_tlb_gen), but we're
* going to need to flush in the very near future, so we might
* as well get it over with.
*
* The only question is whether to do a full or partial flush.
*
* We do a partial flush if requested and two extra conditions
* are met:
*
* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
* we've always done all needed flushes to catch up to
* local_tlb_gen. If, for example, local_tlb_gen == 2 and
* f->new_tlb_gen == 3, then we know that the flush needed to bring
* us up to date for tlb_gen 3 is the partial flush we're
* processing.
*
* As an example of why this check is needed, suppose that there
* are two concurrent flushes. The first is a full flush that
* changes context.tlb_gen from 1 to 2. The second is a partial
* flush that changes context.tlb_gen from 2 to 3. If they get
* processed on this CPU in reverse order, we'll see
* local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
* If we were to use __flush_tlb_one_user() and set local_tlb_gen to
	 * 3, we'd break the invariant: we'd update local_tlb_gen above
* 1 without the full flush that's needed for tlb_gen 2.
*
	 * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
* Partial TLB flushes are not all that much cheaper than full TLB
* flushes, so it seems unlikely that it would be a performance win
* to do a partial flush if that won't bring our TLB fully up to
* date. By doing a full flush instead, we can increase
* local_tlb_gen all the way to mm_tlb_gen and we can probably
* avoid another flush in the very near future.
*/
if (f->end != TLB_FLUSH_ALL &&
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long addr = f->start;
while (addr < f->end) {
__flush_tlb_one_user(addr);
addr += 1UL << f->stride_shift;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
// trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
local_flush_tlb();
if (local)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
// trace_tlb_flush(reason, TLB_FLUSH_ALL);
}
/* Both paths above update our state to mm_tlb_gen. */
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}
static void copied_flush_tlb_func_remote(void *info)
{
const struct flush_tlb_info *f = info;
bool saved_lazy;
inc_irq_stat(irq_tlb_count);
if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
return;
saved_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
this_cpu_write(cpu_tlbstate.is_lazy, false);
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
copied_flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
this_cpu_write(cpu_tlbstate.is_lazy, saved_lazy);
}
static void copied_native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
#if 0
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
else
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
(info->end - info->start) >> PAGE_SHIFT);
#endif
/*
	 * Use the non-UV path in this first version to reduce porting effort;
	 * UV systems can be supported later if necessary.
*/
#if 0
if (is_uv_system()) {
/*
* This whole special case is confused. UV has a "Broadcast
* Assist Unit", which seems to be a fancy way to send IPIs.
* Back when x86 used an explicit TLB flush IPI, UV was
* optimized to use its own mechanism. These days, x86 uses
* smp_call_function_many(), but UV still uses a manual IPI,
* and that IPI's action is out of date -- it does a manual
* flush instead of calling flush_tlb_func_remote(). This
* means that the percpu tlb_gen variables won't be updated
* and we'll do pointless flushes on future context switches.
*
* Rather than hooking native_flush_tlb_others() here, I think
* that UV should be updated so that smp_call_function_many(),
* etc, are optimal on UV.
*/
unsigned int cpu;
cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
smp_call_function_many(cpumask, copied_flush_tlb_func_remote,
(void *)info, 1);
return;
}
#endif
/*
* If no page tables were freed, we can skip sending IPIs to
* CPUs in lazy TLB mode. They will flush the CPU themselves
* at the next context switch.
*
* However, if page tables are getting freed, we need to send the
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
if (info->freed_tables)
smp_call_function_many(cpumask, copied_flush_tlb_func_remote,
(void *)info, 1);
else
on_each_cpu_cond_mask(copied_tlb_is_not_lazy, copied_flush_tlb_func_remote,
(void *)info, 1, GFP_ATOMIC, cpumask);
}
void copied_flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
bool freed_tables)
{
int cpu;
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,
};
cpu = get_cpu();
/* This is also a barrier that synchronizes with switch_mm(). */
info.new_tlb_gen = inc_mm_tlb_gen(mm);
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
((end - start) >> stride_shift) <= copied_tlb_single_page_flush_ceiling) {
info.start = start;
info.end = end;
} else {
info.start = 0UL;
info.end = TLB_FLUSH_ALL;
}
	/* This should never happen in our case. */
if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
VM_WARN_ON(irqs_disabled());
local_irq_disable();
copied_flush_tlb_func_common(&info, true, TLB_LOCAL_MM_SHOOTDOWN);
local_irq_enable();
}
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
copied_native_flush_tlb_others(mm_cpumask(mm), &info);
put_cpu();
}
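#if 0	/* Illustrative sketch only; not compiled into the module. */
/*
 * Example of how the scanner side might use copied_flush_tlb_mm_range():
 * after clearing accessed bits over [start, end) in a task's page tables,
 * flush that range from the TLBs of all CPUs running the mm. PAGE_SHIFT is
 * the stride for 4K mappings, and freed_tables is false because the scan
 * only clears bits and never frees page tables. The function name is
 * hypothetical.
 */
static void example_flush_after_clearing_young(struct mm_struct *mm,
					       unsigned long start,
					       unsigned long end)
{
	copied_flush_tlb_mm_range(mm, start, end, PAGE_SHIFT, false);
}
#endif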
#ifndef _TLB_FLUSH_H
#define _TLB_FLUSH_H
#include <asm/tlbflush.h>
void copied_flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
bool freed_tables);
#endif