提交 349aa245 编写于 作者: O Oran Agra 提交者: antirez

Defrag big lists in portions to avoid latency and freeze

When active defrag kicks in and finds a big list, it will create a bookmark to
a node so that it is able to resume iteration from that node later.

The quicklist manages that bookmark, and updates it in case that node is deleted.

This will increase memory usage only on lists of over 1000 (see
active-defrag-max-scan-fields) quicklist nodes (1000 ziplists, not 1000 items)
by 16 bytes.

In 32 bit build, this change reduces the maximum effective config of
list-compress-depth and list-max-ziplist-size (from 32767 to 8191)
上级 b4ddc7b7
......@@ -5,8 +5,8 @@
* We do that by scanning the keyspace and for each pointer we have, we can try to
* ask the allocator if moving it to a new address will help reduce fragmentation.
*
* Copyright (c) 2017, Oran Agra
* Copyright (c) 2017, Redis Labs, Inc
* Copyright (c) 2020, Oran Agra
* Copyright (c) 2020, Redis Labs, Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
......@@ -408,25 +408,32 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd
return NULL;
}
long activeDefragQuickListNodes(quicklist *ql) {
quicklistNode *node = ql->head, *newnode;
long activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) {
quicklistNode *newnode, *node = *node_ref;
long defragged = 0;
unsigned char *newzl;
if ((newnode = activeDefragAlloc(node))) {
if (newnode->prev)
newnode->prev->next = newnode;
else
ql->head = newnode;
if (newnode->next)
newnode->next->prev = newnode;
else
ql->tail = newnode;
*node_ref = node = newnode;
defragged++;
}
if ((newzl = activeDefragAlloc(node->zl)))
defragged++, node->zl = newzl;
return defragged;
}
long activeDefragQuickListNodes(quicklist *ql) {
quicklistNode *node = ql->head;
long defragged = 0;
while (node) {
if ((newnode = activeDefragAlloc(node))) {
if (newnode->prev)
newnode->prev->next = newnode;
else
ql->head = newnode;
if (newnode->next)
newnode->next->prev = newnode;
else
ql->tail = newnode;
node = newnode;
defragged++;
}
if ((newzl = activeDefragAlloc(node->zl)))
defragged++, node->zl = newzl;
defragged += activeDefragQuickListNode(ql, &node);
node = node->next;
}
return defragged;
......@@ -440,12 +447,48 @@ void defragLater(redisDb *db, dictEntry *kde) {
listAddNodeTail(db->defrag_later, key);
}
long scanLaterList(robj *ob) {
/* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */
long scanLaterList(robj *ob, unsigned long *cursor, long long endtime, long long *defragged) {
quicklist *ql = ob->ptr;
quicklistNode *node;
long iterations = 0;
int bookmark_failed = 0;
if (ob->type != OBJ_LIST || ob->encoding != OBJ_ENCODING_QUICKLIST)
return 0;
server.stat_active_defrag_scanned+=ql->len;
return activeDefragQuickListNodes(ql);
if (*cursor == 0) {
/* if cursor is 0, we start new iteration */
node = ql->head;
} else {
node = quicklistBookmarkFind(ql, "_AD");
if (!node) {
/* if the bookmark was deleted, it means we reached the end. */
*cursor = 0;
return 0;
}
node = node->next;
}
(*cursor)++;
while (node) {
(*defragged) += activeDefragQuickListNode(ql, &node);
server.stat_active_defrag_scanned++;
if (++iterations > 128 && !bookmark_failed) {
if (ustime() > endtime) {
if (!quicklistBookmarkCreate(&ql, "_AD", node)) {
bookmark_failed = 1;
} else {
ob->ptr = ql; /* bookmark creation may have re-allocated the quicklist */
return 1;
}
}
iterations = 0;
}
node = node->next;
}
quicklistBookmarkDelete(ql, "_AD");
*cursor = 0;
return bookmark_failed? 1: 0;
}
typedef struct {
......@@ -638,7 +681,8 @@ int scanLaterStraemListpacks(robj *ob, unsigned long *cursor, long long endtime,
void *newdata = activeDefragAlloc(ri.data);
if (newdata)
raxSetData(ri.node, ri.data=newdata), (*defragged)++;
if (++iterations > 16) {
server.stat_active_defrag_scanned++;
if (++iterations > 128) {
if (ustime() > endtime) {
serverAssert(ri.key_len==sizeof(last));
memcpy(last,ri.key,ri.key_len);
......@@ -900,8 +944,7 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime) {
if (de) {
robj *ob = dictGetVal(de);
if (ob->type == OBJ_LIST) {
server.stat_active_defrag_hits += scanLaterList(ob);
*cursor = 0; /* list has no scan, we must finish it in one go */
return scanLaterList(ob, cursor, endtime, &server.stat_active_defrag_hits);
} else if (ob->type == OBJ_SET) {
server.stat_active_defrag_hits += scanLaterSet(ob, cursor);
} else if (ob->type == OBJ_ZSET) {
......@@ -961,11 +1004,6 @@ int defragLaterStep(redisDb *db, long long endtime) {
if (defragLaterItem(de, &defrag_later_cursor, endtime))
quit = 1; /* time is up, we didn't finish all the work */
/* Don't start a new BIG key in this loop, this is because the
* next key can be a list, and scanLaterList must be done in once cycle */
if (!defrag_later_cursor)
quit = 1;
/* Once in 16 scan iterations, 512 pointer reallocations, or 64 fields
* (if we have a lot of pointers in one hash bucket, or rehashing),
* check if we reached the time limit. */
......
......@@ -70,6 +70,12 @@ static const size_t optimization_level[] = {4096, 8192, 16384, 32768, 65536};
} while (0);
#endif
/* Bookmarks forward declarations */
#define QL_MAX_BM ((1 << QL_BM_BITS)-1)
quicklistBookmark *_quicklistBookmarkFindByName(quicklist *ql, const char *name);
quicklistBookmark *_quicklistBookmarkFindByNode(quicklist *ql, quicklistNode *node);
void _quicklistBookmarkDelete(quicklist *ql, quicklistBookmark *bm);
/* Simple way to give quicklistEntry structs default values with one call. */
#define initEntry(e) \
do { \
......@@ -100,10 +106,11 @@ quicklist *quicklistCreate(void) {
quicklist->count = 0;
quicklist->compress = 0;
quicklist->fill = -2;
quicklist->bookmark_count = 0;
return quicklist;
}
#define COMPRESS_MAX (1 << 16)
#define COMPRESS_MAX (1 << QL_COMP_BITS)
void quicklistSetCompressDepth(quicklist *quicklist, int compress) {
if (compress > COMPRESS_MAX) {
compress = COMPRESS_MAX;
......@@ -113,7 +120,7 @@ void quicklistSetCompressDepth(quicklist *quicklist, int compress) {
quicklist->compress = compress;
}
#define FILL_MAX (1 << 15)
#define FILL_MAX (1 << (QL_FILL_BITS-1))
void quicklistSetFill(quicklist *quicklist, int fill) {
if (fill > FILL_MAX) {
fill = FILL_MAX;
......@@ -169,6 +176,7 @@ void quicklistRelease(quicklist *quicklist) {
quicklist->len--;
current = next;
}
quicklistBookmarksClear(quicklist);
zfree(quicklist);
}
......@@ -578,6 +586,15 @@ quicklist *quicklistCreateFromZiplist(int fill, int compress,
REDIS_STATIC void __quicklistDelNode(quicklist *quicklist,
quicklistNode *node) {
/* Update the bookmark if any */
quicklistBookmark *bm = _quicklistBookmarkFindByNode(quicklist, node);
if (bm) {
bm->node = node->next;
/* if the bookmark was to the last node, delete it. */
if (!bm->node)
_quicklistBookmarkDelete(quicklist, bm);
}
if (node->next)
node->next->prev = node->prev;
if (node->prev)
......@@ -1410,6 +1427,87 @@ void quicklistPush(quicklist *quicklist, void *value, const size_t sz,
}
}
/* Create or update a bookmark in the list which will be updated to the next node
* automatically when the one referenced gets deleted.
* Returns 1 on success (creation of new bookmark or override of an existing one).
* Returns 0 on failure (reached the maximum supported number of bookmarks).
* NOTE: use short simple names, so that string compare on find is quick.
* NOTE: bookmakrk creation may re-allocate the quicklist, so the input pointer
may change and it's the caller responsibilty to update the reference.
*/
int quicklistBookmarkCreate(quicklist **ql_ref, const char *name, quicklistNode *node) {
quicklist *ql = *ql_ref;
if (ql->bookmark_count >= QL_MAX_BM)
return 0;
quicklistBookmark *bm = _quicklistBookmarkFindByName(ql, name);
if (bm) {
bm->node = node;
return 1;
}
ql = zrealloc(ql, sizeof(quicklist) + (ql->bookmark_count+1) * sizeof(quicklistBookmark));
*ql_ref = ql;
ql->bookmarks[ql->bookmark_count].node = node;
ql->bookmarks[ql->bookmark_count].name = zstrdup(name);
ql->bookmark_count++;
return 1;
}
/* Find the quicklist node referenced by a named bookmark.
* When the bookmarked node is deleted the bookmark is updated to the next node,
* and if that's the last node, the bookmark is deleted (so find returns NULL). */
quicklistNode *quicklistBookmarkFind(quicklist *ql, const char *name) {
quicklistBookmark *bm = _quicklistBookmarkFindByName(ql, name);
if (!bm) return NULL;
return bm->node;
}
/* Delete a named bookmark.
* returns 0 if bookmark was not found, and 1 if deleted.
* Note that the bookmark memory is not freed yet, and is kept for future use. */
int quicklistBookmarkDelete(quicklist *ql, const char *name) {
quicklistBookmark *bm = _quicklistBookmarkFindByName(ql, name);
if (!bm)
return 0;
_quicklistBookmarkDelete(ql, bm);
return 1;
}
quicklistBookmark *_quicklistBookmarkFindByName(quicklist *ql, const char *name) {
unsigned i;
for (i=0; i<ql->bookmark_count; i++) {
if (!strcmp(ql->bookmarks[i].name, name)) {
return &ql->bookmarks[i];
}
}
return NULL;
}
quicklistBookmark *_quicklistBookmarkFindByNode(quicklist *ql, quicklistNode *node) {
unsigned i;
for (i=0; i<ql->bookmark_count; i++) {
if (ql->bookmarks[i].node == node) {
return &ql->bookmarks[i];
}
}
return NULL;
}
void _quicklistBookmarkDelete(quicklist *ql, quicklistBookmark *bm) {
int index = bm - ql->bookmarks;
zfree(bm->name);
ql->bookmark_count--;
memmove(bm, bm+1, (ql->bookmark_count - index)* sizeof(*bm));
/* NOTE: We do not shrink (realloc) the quicklist yet (to avoid resonance,
* it may be re-used later (a call to realloc may NOP). */
}
void quicklistBookmarksClear(quicklist *ql) {
while (ql->bookmark_count)
zfree(ql->bookmarks[--ql->bookmark_count].name);
/* NOTE: We do not shrink (realloc) the quick list. main use case for this
* function is just before releasing the allocation. */
}
/* The rest of this file is test cases and test helpers. */
#ifdef REDIS_TEST
#include <stdint.h>
......@@ -2641,6 +2739,54 @@ int quicklistTest(int argc, char *argv[]) {
printf("Compressions: %0.2f seconds.\n", (float)(stop - start) / 1000);
printf("\n");
TEST("bookmark get updated to next item") {
quicklist *ql = quicklistNew(1, 0);
quicklistPushTail(ql, "1", 1);
quicklistPushTail(ql, "2", 1);
quicklistPushTail(ql, "3", 1);
quicklistPushTail(ql, "4", 1);
quicklistPushTail(ql, "5", 1);
assert(ql->len==5);
/* add two bookmarks, one pointing to the node before the last. */
assert(quicklistBookmarkCreate(&ql, "_dummy", ql->head->next));
assert(quicklistBookmarkCreate(&ql, "_test", ql->tail->prev));
/* test that the bookmark returns the right node, delete it and see that the bookmark points to the last node */
assert(quicklistBookmarkFind(ql, "_test") == ql->tail->prev);
assert(quicklistDelRange(ql, -2, 1));
assert(quicklistBookmarkFind(ql, "_test") == ql->tail);
/* delete the last node, and see that the bookmark was deleted. */
assert(quicklistDelRange(ql, -1, 1));
assert(quicklistBookmarkFind(ql, "_test") == NULL);
/* test that other bookmarks aren't affected */
assert(quicklistBookmarkFind(ql, "_dummy") == ql->head->next);
assert(quicklistBookmarkFind(ql, "_missing") == NULL);
assert(ql->len==3);
quicklistBookmarksClear(ql); /* for coverage */
assert(quicklistBookmarkFind(ql, "_dummy") == NULL);
quicklistRelease(ql);
}
TEST("bookmark limit") {
int i;
quicklist *ql = quicklistNew(1, 0);
quicklistPushHead(ql, "1", 1);
for (i=0; i<QL_MAX_BM; i++)
assert(quicklistBookmarkCreate(&ql, genstr("",i), ql->head));
/* when all bookmarks are used, creation fails */
assert(!quicklistBookmarkCreate(&ql, "_test", ql->head));
/* delete one and see that we can now create another */
assert(quicklistBookmarkDelete(ql, "0"));
assert(quicklistBookmarkCreate(&ql, "_test", ql->head));
/* delete one and see that the rest survive */
assert(quicklistBookmarkDelete(ql, "_test"));
for (i=1; i<QL_MAX_BM; i++)
assert(quicklistBookmarkFind(ql, genstr("",i)) == ql->head);
/* make sure the deleted ones are indeed gone */
assert(!quicklistBookmarkFind(ql, "0"));
assert(!quicklistBookmarkFind(ql, "_test"));
quicklistRelease(ql);
}
if (!err)
printf("ALL TESTS PASSED!\n");
else
......
......@@ -28,6 +28,8 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdint.h> // for UINTPTR_MAX
#ifndef __QUICKLIST_H__
#define __QUICKLIST_H__
......@@ -64,19 +66,51 @@ typedef struct quicklistLZF {
char compressed[];
} quicklistLZF;
/* Bookmarks are padded with realloc at the end of of the quicklist struct.
* They should only be used for very big lists if thousands of nodes were the
* excess memory usage is negligible, and there's a real need to iterate on them
* in portions.
* When not used, they don't add any memory overhead, but when used and then
* deleted, some overhead remains (to avoid resonance).
* The number of bookmarks used should be kept to minimum since it also adds
* overhead on node deletion (searching for a bookmark to update). */
typedef struct quicklistBookmark {
quicklistNode *node;
char *name;
} quicklistBookmark;
#if UINTPTR_MAX == 0xffffffff
/* 32-bit */
# define QL_FILL_BITS 14
# define QL_COMP_BITS 14
# define QL_BM_BITS 4
#elif UINTPTR_MAX == 0xffffffffffffffff
/* 64-bit */
# define QL_FILL_BITS 16
# define QL_COMP_BITS 16
# define QL_BM_BITS 4 /* we can encode more, but we rather limit the user
since they cause performance degradation. */
#else
# error unknown arch bits count
#endif
/* quicklist is a 40 byte struct (on 64-bit systems) describing a quicklist.
* 'count' is the number of total entries.
* 'len' is the number of quicklist nodes.
* 'compress' is: -1 if compression disabled, otherwise it's the number
* of quicklistNodes to leave uncompressed at ends of quicklist.
* 'fill' is the user-requested (or default) fill factor. */
* 'fill' is the user-requested (or default) fill factor.
* 'bookmakrs are an optional feature that is used by realloc this struct,
* so that they don't consume memory when not used. */
typedef struct quicklist {
quicklistNode *head;
quicklistNode *tail;
unsigned long count; /* total count of all entries in all ziplists */
unsigned long len; /* number of quicklistNodes */
int fill : 16; /* fill factor for individual nodes */
unsigned int compress : 16; /* depth of end nodes not to compress;0=off */
int fill : QL_FILL_BITS; /* fill factor for individual nodes */
unsigned int compress : QL_COMP_BITS; /* depth of end nodes not to compress;0=off */
unsigned int bookmark_count: QL_BM_BITS;
quicklistBookmark bookmarks[];
} quicklist;
typedef struct quicklistIter {
......@@ -158,6 +192,12 @@ unsigned long quicklistCount(const quicklist *ql);
int quicklistCompare(unsigned char *p1, unsigned char *p2, int p2_len);
size_t quicklistGetLzf(const quicklistNode *node, void **data);
/* bookmarks */
int quicklistBookmarkCreate(quicklist **ql_ref, const char *name, quicklistNode *node);
int quicklistBookmarkDelete(quicklist *ql, const char *name);
quicklistNode *quicklistBookmarkFind(quicklist *ql, const char *name);
void quicklistBookmarksClear(quicklist *ql);
#ifdef REDIS_TEST
int quicklistTest(int argc, char *argv[]);
#endif
......
......@@ -209,5 +209,97 @@ start_server {tags {"defrag"}} {
assert {$digest eq $newdigest}
r save ;# saving an rdb iterates over all the data / pointers
} {OK}
test "Active defrag big list" {
r flushdb
r config resetstat
r config set save "" ;# prevent bgsave from interfereing with save below
r config set hz 100
r config set activedefrag no
r config set active-defrag-max-scan-fields 1000
r config set active-defrag-threshold-lower 5
r config set active-defrag-cycle-min 65
r config set active-defrag-cycle-max 75
r config set active-defrag-ignore-bytes 2mb
r config set maxmemory 0
r config set list-max-ziplist-size 5 ;# list of 500k items will have 100k quicklist nodes
# create big keys with 10k items
set rd [redis_deferring_client]
set expected_frag 1.7
# add a mass of list nodes to two lists (allocations are interlaced)
set val [string repeat A 100] ;# 5 items of 100 bytes puts us in the 640 bytes bin, which has 32 regs, so high potential for fragmentation
for {set j 0} {$j < 500000} {incr j} {
$rd lpush biglist1 $val
$rd lpush biglist2 $val
}
for {set j 0} {$j < 500000} {incr j} {
$rd read ; # Discard replies
$rd read ; # Discard replies
}
# create some fragmentation
r del biglist2
# start defrag
after 120 ;# serverCron only updates the info once in 100ms
set frag [s allocator_frag_ratio]
if {$::verbose} {
puts "frag $frag"
}
assert {$frag >= $expected_frag}
r config set latency-monitor-threshold 5
r latency reset
set digest [r debug digest]
catch {r config set activedefrag yes} e
if {![string match {DISABLED*} $e]} {
# wait for the active defrag to start working (decision once a second)
wait_for_condition 50 100 {
[s active_defrag_running] ne 0
} else {
fail "defrag not started."
}
# wait for the active defrag to stop working
wait_for_condition 500 100 {
[s active_defrag_running] eq 0
} else {
after 120 ;# serverCron only updates the info once in 100ms
puts [r info memory]
puts [r info stats]
puts [r memory malloc-stats]
fail "defrag didn't stop."
}
# test the the fragmentation is lower
after 120 ;# serverCron only updates the info once in 100ms
set frag [s allocator_frag_ratio]
set max_latency 0
foreach event [r latency latest] {
lassign $event eventname time latency max
if {$eventname == "active-defrag-cycle"} {
set max_latency $max
}
}
if {$::verbose} {
puts "frag $frag"
puts "max latency $max_latency"
puts [r latency latest]
puts [r latency history active-defrag-cycle]
}
assert {$frag < 1.1}
# due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
# we expect max latency to be not much higher than 7.5ms
assert {$max_latency <= 12}
}
# verify the data isn't corrupted or changed
set newdigest [r debug digest]
assert {$digest eq $newdigest}
r save ;# saving an rdb iterates over all the data / pointers
r del biglist1 ;# coverage for quicklistBookmarksClear
} {1}
}
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册