Commit 1087d6c6 authored by openeuler-ci-bot and committed by Gitee
obj-m := kvm_ept_idle.o
kvm_ept_idle-y := ept_idle.o ept_idle_native_pagewalk.o tlb_flush.o
include Kbuild
# KERNEL_SRC_DIR=/lib/modules/$(shell uname -r)/build
# MODULE_DIR=$(shell pwd)
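# Example invocation (an assumption about intended usage, since the two
# variables above are left commented out): pass them on the command line, e.g.
#   make KERNEL_SRC_DIR=/lib/modules/$(uname -r)/build MODULE_DIR=$(pwd)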
default:
$(MAKE) -C $(KERNEL_SRC_DIR) M=$(MODULE_DIR) modules
clean:
rm -f *.o *.ko *.mod.c modules.order Module.symvers
# memory-scan
#### Description
A kernel module for scanning the page tables of processes and VMs.
#### Software Architecture
memory-scan is derived from memory-optimizer (https://github.com/intel/memory-optimizer/tree/master/kernel_module). It is a kernel module for scanning the page tables of processes and VMs.
#### Installation
Build and load the module as described under Instructions below.
#### Instructions
openEuler users can use memory-scan as follows (a hypothetical userspace consumer is sketched after these steps):
1. make
2. insmod memory_scan.ko
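The header `ept_idle.h` defines an `IDLE_PAGE_SET_PID` ioctl and a proc `file_operations` table, so a userspace consumer is expected to select a target pid and then `read()` back the encoded scan results. The following is only a minimal sketch: the proc path `/proc/memory_scan` and the pass-pid-by-pointer ioctl convention are assumptions, not something fixed by the files in this commit.

```c
#include <fcntl.h>
#include <linux/ioctl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <unistd.h>

/* Must match ept_idle.h: _IOW(0x1, 0x1, pid_t). */
#define IDLE_PAGE_SET_PID _IOW(0x1, 0x1, pid_t)

int main(int argc, char **argv)
{
	uint8_t buf[4096];
	ssize_t n;
	pid_t pid;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	pid = (pid_t)atoi(argv[1]);

	/* The proc path is an assumption; use whatever ept_idle.c registers. */
	fd = open("/proc/memory_scan", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* _IOW() suggests the pid is passed by pointer, but the actual
	 * convention depends on the module's ioctl handler. */
	if (ioctl(fd, IDLE_PAGE_SET_PID, &pid) < 0) {
		perror("ioctl");
		close(fd);
		return 1;
	}

	n = read(fd, buf, sizeof(buf));
	if (n > 0)
		printf("read %zd bytes of PIP-encoded idle-page data\n", n);

	close(fd);
	return 0;
}
```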
#### Contribution
......
# memory-scan
#### Introduction
A kernel module for scanning the page tables of processes and VMs.
#### Software Architecture
memory-scan is derived from memory-optimizer (https://github.com/intel/memory-optimizer/tree/master/kernel_module). memory-scan is a kernel module for scanning the memory page access state of processes and virtual machines.
#### Installation
Build and load the module as described under Instructions below.
#### Instructions
openEuler users can use memory-scan as follows:
1. make
2. insmod memory_scan.ko
#### Contributing
......
#ifndef _EPT_IDLE_H
#define _EPT_IDLE_H
#include "ept_idle_common.h"
#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */
#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */
#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */
enum ProcIdlePageType {
PTE_ACCESSED, /* 4k page */
PMD_ACCESSED, /* 2M page */
PUD_PRESENT, /* 1G page */
PTE_DIRTY,
PMD_DIRTY,
PTE_IDLE,
PMD_IDLE,
PMD_IDLE_PTES, /* all PTE idle */
PTE_HOLE,
PMD_HOLE,
PIP_CMD,
IDLE_PAGE_TYPE_MAX
};
#define PIP_TYPE(a) (0xf & (a >> 4))
#define PIP_SIZE(a) (0xf & a)
#define PIP_COMPOSE(type, nr) ((type << 4) | nr)
#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0)
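/*
 * Worked example (follows directly from the macros above):
 * PIP_COMPOSE(PTE_ACCESSED, 3) packs type PTE_ACCESSED (0) into the high
 * nibble and a run length of 3 into the low nibble, giving the byte 0x03,
 * i.e. "3 consecutive accessed 4K pages"; PIP_TYPE()/PIP_SIZE() recover
 * the two fields on the consumer side.
 */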
#define _PAGE_BIT_EPT_ACCESSED 8
#define _PAGE_BIT_EPT_DIRTY 9
#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED)
#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY)
#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7))
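/*
 * In an Intel EPT entry, bits 0-2 are the read/write/execute permission
 * bits (so "present" here means any of 0x7 is set) and bits 8/9 are the
 * EPT accessed/dirty flags, which is what the _PAGE_EPT_* values above
 * encode.
 */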
static inline int ept_pte_present(pte_t a)
{
return pte_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pmd_present(pmd_t a)
{
return pmd_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pud_present(pud_t a)
{
return pud_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_p4d_present(p4d_t a)
{
return p4d_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pgd_present(pgd_t a)
{
return pgd_flags(a) & _PAGE_EPT_PRESENT;
}
static inline int ept_pte_accessed(pte_t a)
{
return pte_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pmd_accessed(pmd_t a)
{
return pmd_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pud_accessed(pud_t a)
{
return pud_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_p4d_accessed(p4d_t a)
{
return p4d_flags(a) & _PAGE_EPT_ACCESSED;
}
static inline int ept_pgd_accessed(pgd_t a)
{
return pgd_flags(a) & _PAGE_EPT_ACCESSED;
}
extern struct file_operations proc_ept_idle_operations;
#define EPT_IDLE_KBUF_FULL 1
#define EPT_IDLE_BUF_FULL 2
#define EPT_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3)
#define EPT_IDLE_KBUF_SIZE 8000
#define IDLE_PAGE_SET_PID _IOW(0x1, 0x1, pid_t)
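/*
 * Per-scan control state (a summary inferred from the fields below; the
 * scanner itself lives in ept_idle.c): kpie[] stages PIP-encoded result
 * bytes in the kernel, pie_read/pie_read_max track its drain position,
 * buf/buf_size/bytes_copied describe the userspace buffer they are copied
 * to, and next_hva/gpa_to_hva/restart_gpa/last_va record where the next
 * read() resumes the walk.
 */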
struct ept_idle_ctrl {
struct mm_struct *mm;
struct kvm *kvm;
uint8_t kpie[EPT_IDLE_KBUF_SIZE];
int pie_read;
int pie_read_max;
void __user *buf;
int buf_size;
int bytes_copied;
unsigned long next_hva; /* GPA for EPT; VA for PT */
unsigned long gpa_to_hva;
unsigned long restart_gpa;
unsigned long last_va;
unsigned int flags;
};
#endif
// SPDX-License-Identifier: GPL-2.0
#ifndef _EPT_IDLE_COMMON_H
#define _EPT_IDLE_COMMON_H
/* Work around missing 5-level paging (p4d) support on old kernels */
#ifndef CONFIG_PGTABLE_LEVELS
#define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#else
#if CONFIG_PGTABLE_LEVELS < 4
#define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#endif // #if CONFIG_PGTABLE_LEVELS < 4
#endif // #ifndef CONFIG_PGTABLE_LEVELS
#ifdef EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT
#define p4d_t pgd_t
#define p4d_flags pgd_flags
#define p4d_offset(pgd, start) (pgd)
#define p4d_addr_end(addr, end) (end)
#define p4d_present(p4d) 1
#define p4d_ERROR(p4d) do { } while(0)
#define p4d_clear pgd_clear
#define p4d_none(p4d) 0
#define p4d_bad(p4d) 0
#define p4d_clear_bad pgd_clear_bad
#endif
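/*
 * With this fallback the p4d level degenerates into a pass-through:
 * p4d_offset() hands back the pgd entry itself and p4d_addr_end() returns
 * @end, so walk_p4d_range() in ept_idle_native_pagewalk.c iterates exactly
 * once and the walk behaves as on a kernel without a separate p4d level.
 */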
#ifndef pgd_offset_pgd
#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
#endif
#endif
// SPDX-License-Identifier: GPL-2.0
// Copied from kernel mm/pagewalk.c, modified by yuan.yao@intel.com
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
#include "ept_idle_common.h"
#ifdef CONFIG_HUGETLB_PAGE
int pmd_huge(pmd_t pmd)
{
return !pmd_none(pmd) &&
(pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
}
int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
/*
* ept_idle_huge_pte_offset() - Walk the page table to resolve the hugepage
* entry at address @addr
*
* Return: Pointer to page table or swap entry (PUD or PMD) for
* address @addr, or NULL if a p*d_none() entry is encountered and the
* size @sz doesn't match the hugepage size at this level of the page
* table.
*/
pte_t *ept_idle_huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
return NULL;
p4d = p4d_offset(pgd, addr);
if (!p4d_present(*p4d))
return NULL;
pud = pud_offset(p4d, addr);
if (sz != PUD_SIZE && pud_none(*pud))
return NULL;
/* hugepage or swap? */
if (pud_huge(*pud) || !pud_present(*pud))
return (pte_t *)pud;
pmd = pmd_offset(pud, addr);
if (sz != PMD_SIZE && pmd_none(*pmd))
return NULL;
/* hugepage or swap? */
if (pmd_huge(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
return NULL;
}
#else // #ifdef CONFIG_HUGETLB_PAGE
#define pud_huge(x) 0
#define pmd_huge(x) 0
#define ept_idle_huge_pte_offset(mm, address, sz) 0
#endif
#ifndef VM_BUG_ON_VMA
#define VM_BUG_ON_VMA(cond, vma) \
do { \
if (unlikely(cond)) { \
BUG(); \
} \
} while (0)
#endif
#ifndef VM_BUG_ON_MM
#define VM_BUG_ON_MM VM_BUG_ON_VMA
#endif
static inline int ept_idle_p4d_none_or_clear_bad(p4d_t *p4d)
{
if (p4d_none(*p4d))
return 1;
if (unlikely(p4d_bad(*p4d))) {
p4d_clear_bad(p4d);
return 1;
}
return 0;
}
static inline spinlock_t *ept_idle_pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
spinlock_t *ptl;
VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
ptl = pud_lock(vma->vm_mm, pud);
if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
return ptl;
spin_unlock(ptl);
return NULL;
}
void p4d_clear_bad(p4d_t *p4d)
{
p4d_ERROR(*p4d);
p4d_clear(p4d);
}
void pmd_clear_bad(pmd_t *pmd)
{
pmd_ERROR(*pmd);
pmd_clear(pmd);
}
#ifdef _EPT_IDLE_SPLIT_PMD_
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pte_t *pte;
int err = 0;
pte = pte_offset_map(pmd, addr);
for (;;) {
err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
addr += PAGE_SIZE;
if (addr == end)
break;
pte++;
}
pte_unmap(pte);
return err;
}
#endif
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pmd_t *pmd;
unsigned long next;
int err = 0;
pmd = pmd_offset(pud, addr);
do {
#ifdef _EPT_IDLE_SPLIT_PMD_
again:
#endif
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd) || !walk->vma) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
/*
* This implies that each ->pmd_entry() handler
* needs to know about pmd_trans_huge() pmds
*/
if (walk->pmd_entry)
err = walk->pmd_entry(pmd, addr, next, walk);
if (err)
break;
#ifdef _EPT_IDLE_SPLIT_PMD_
/*
* Check this here so we only break down trans_huge
* pages when we _need_ to
*/
if (!walk->pte_entry)
continue;
split_huge_pmd(walk->vma, pmd, addr);
if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
#endif
} while (pmd++, addr = next, addr != end);
return err;
}
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pud_t *pud;
unsigned long next;
int err = 0;
pud = pud_offset(p4d, addr);
do {
#ifdef _EPT_IDLE_SPLIT_PUD_
again:
#endif
next = pud_addr_end(addr, end);
if (pud_none(*pud) || !walk->vma) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pud_entry) {
spinlock_t *ptl = ept_idle_pud_trans_huge_lock(pud, walk->vma);
if (ptl) {
err = walk->pud_entry(pud, addr, next, walk);
spin_unlock(ptl);
if (err)
break;
continue;
}
}
#ifdef _EPT_IDLE_SPLIT_PUD_
split_huge_pud(walk->vma, pud, addr);
if (pud_none(*pud))
goto again;
#endif
if (walk->pmd_entry || walk->pte_entry)
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
return err;
}
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
p4d_t *p4d;
unsigned long next;
int err = 0;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (ept_idle_p4d_none_or_clear_bad(p4d)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pmd_entry || walk->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
} while (p4d++, addr = next, addr != end);
return err;
}
static int walk_pgd_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pgd_t *pgd;
unsigned long next;
int err = 0;
pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pmd_entry || walk->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
} while (pgd++, addr = next, addr != end);
return err;
}
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
unsigned long end)
{
unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
return boundary < end ? boundary : end;
}
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct hstate *h = hstate_vma(vma);
unsigned long next;
unsigned long hmask = huge_page_mask(h);
unsigned long sz = huge_page_size(h);
pte_t *pte;
int err = 0;
do {
next = hugetlb_entry_end(h, addr, end);
pte = ept_idle_huge_pte_offset(walk->mm, addr & hmask, sz);
if (pte)
err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
else if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
} while (addr = next, addr != end);
return err;
}
#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */
/*
* Decide whether we really walk over the current vma on [@start, @end)
* or skip it via the returned value. Return 0 if we do walk over the
* current vma, and return 1 if we skip the vma. Negative values means
* error, where we abort the current walk.
*/
static int walk_page_test(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
if (walk->test_walk)
return walk->test_walk(start, end, walk);
/*
* vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
* range, so we don't walk over it as we do for normal vmas. However,
 * some callers are interested in handling hole ranges and they don't
* want to just ignore any single address range. Such users certainly
* define their ->pte_hole() callbacks, so let's delegate them to handle
* vma(VM_PFNMAP).
*/
if (vma->vm_flags & VM_PFNMAP) {
int err = 1;
if (walk->pte_hole)
err = walk->pte_hole(start, end, walk);
return err ? err : 1;
}
return 0;
}
static int __walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
int err = 0;
struct vm_area_struct *vma = walk->vma;
if (vma && is_vm_hugetlb_page(vma)) {
if (walk->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);
return err;
}
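/*
 * Usage sketch (illustration only, not code shipped in this module): a
 * caller such as ept_idle.c is expected to fill in a struct mm_walk and
 * run the walker with mmap_sem held for read, roughly:
 *
 *	static int my_pmd_entry(pmd_t *pmd, unsigned long addr,
 *				unsigned long next, struct mm_walk *walk)
 *	{
 *		struct ept_idle_ctrl *eic = walk->private;
 *		// inspect *pmd, record accessed/idle state into eic->kpie[]
 *		return 0;
 *	}
 *
 *	struct mm_walk walk = {
 *		.mm        = mm,
 *		.pmd_entry = my_pmd_entry,
 *		.private   = eic,
 *	};
 *	down_read(&mm->mmap_sem);
 *	err = ept_idle_walk_page_range(start, end, &walk);
 *	up_read(&mm->mmap_sem);
 */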
/**
 * ept_idle_walk_page_range - walk page table with caller specific callbacks
* @start: start address of the virtual address range
* @end: end address of the virtual address range
* @walk: mm_walk structure defining the callbacks and the target address space
*
* Recursively walk the page table tree of the process represented by @walk->mm
* within the virtual address range [@start, @end). During walking, we can do
* some caller-specific works for each entry, by setting up pmd_entry(),
* pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
* callbacks, the associated entries/pages are just ignored.
* The return values of these callbacks are commonly defined like below:
*
* - 0 : succeeded to handle the current entry, and if you don't reach the
* end address yet, continue to walk.
* - >0 : succeeded to handle the current entry, and return to the caller
* with caller specific value.
* - <0 : failed to handle the current entry, and return to the caller
* with error code.
*
* Before starting to walk page table, some callers want to check whether
* they really want to walk over the current vma, typically by checking
* its vm_flags. walk_page_test() and @walk->test_walk() are used for this
* purpose.
*
* struct mm_walk keeps current values of some common data like vma and pmd,
* which are useful for the access from callbacks. If you want to pass some
* caller-specific data to callbacks, @walk->private should be helpful.
*
* Locking:
 * Callers of ept_idle_walk_page_range() should hold @walk->mm->mmap_sem,
 * because this function traverses the vma list and/or accesses vma data.
*/
int ept_idle_walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
int err = 0;
unsigned long next;
struct vm_area_struct *vma;
if<