From 491d57abaae7ddb9f32951acc3c38e27f3638b90 Mon Sep 17 00:00:00 2001
From: Matt Stancliff <matt@genges.com>
Date: Tue, 13 Jan 2015 11:15:30 -0500
Subject: [PATCH 01/11] Add --track-origins=yes to valgrind

---
 tests/instances.tcl      | 4 ++--
 tests/support/server.tcl | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/instances.tcl b/tests/instances.tcl
index b9eb42258..4e2f33dfc 100644
--- a/tests/instances.tcl
+++ b/tests/instances.tcl
@@ -68,7 +68,7 @@ proc spawn_instance {type base_port count {conf {}}} {
         }
 
         if {$::valgrind} {
-            set pid [exec valgrind --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
+            set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
         } else {
             set pid [exec ../../../src/${prgname} $cfgfile &]
         }
@@ -401,7 +401,7 @@ proc restart_instance {type id} {
     }
 
     if {$::valgrind} {
-        set pid [exec valgrind --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
+        set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
     } else {
         set pid [exec ../../../src/${prgname} $cfgfile &]
     }
diff --git a/tests/support/server.tcl b/tests/support/server.tcl
index 67ee24528..317b40a84 100644
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@@ -207,7 +207,7 @@ proc start_server {options {code undefined}} {
     set stderr [format "%s/%s" [dict get $config "dir"] "stderr"]
 
     if {$::valgrind} {
-        set pid [exec valgrind --suppressions=src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full src/redis-server $config_file > $stdout 2> $stderr &]
+        set pid [exec valgrind --track-origins=yes --suppressions=src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full src/redis-server $config_file > $stdout 2> $stderr &]
     } else {
         set pid [exec src/redis-server $config_file > $stdout 2> $stderr &]
     }

From 59ad6ac5feac4f6760144861b723a51383a0f19b Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Wed, 21 Jan 2015 15:55:53 +0100
Subject: [PATCH 02/11] Cluster: set the slaves->slaveof field to NULL when
 master is freed.

Related to issue #2289.
---
 src/cluster.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/cluster.c b/src/cluster.c
index ec6901e8f..5135cdaa5 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -821,6 +821,14 @@ int clusterCountNonFailingSlaves(clusterNode *n) {
 
 void freeClusterNode(clusterNode *n) {
     sds nodename;
+    int j;
+
+    /* If the node is a master with associated slaves, we have to set
+     * all the slaves->slaveof fields to NULL (unknown). */
+    if (nodeIsMaster(n)) {
+        for (j = 0; j < n->numslaves; j++)
+            n->slaves[j]->slaveof = NULL;
+    }
 
     nodename = sdsnewlen(n->name, REDIS_CLUSTER_NAMELEN);
     redisAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK);

From 2601e3e4614800552a3ff5c91b4ac475a664df5a Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Wed, 21 Jan 2015 16:03:43 +0100
Subject: [PATCH 03/11] Cluster: node deletion cleanup / centralization.

---
 src/cluster.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index 5135cdaa5..01d6a32e8 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -819,6 +819,7 @@ int clusterCountNonFailingSlaves(clusterNode *n) {
     return okslaves;
 }
 
+/* Low level cleanup of the node structure. Only called by clusterDelNode(). */
 void freeClusterNode(clusterNode *n) {
     sds nodename;
     int j;
@@ -830,10 +831,15 @@ void freeClusterNode(clusterNode *n) {
             n->slaves[j]->slaveof = NULL;
     }
 
+    /* Remove this node from the list of slaves of its master. */
+    if (nodeIsSlave(n) && n->slaveof) clusterNodeRemoveSlave(n->slaveof,n);
+
+    /* Unlink from the set of nodes. */
     nodename = sdsnewlen(n->name, REDIS_CLUSTER_NAMELEN);
     redisAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK);
     sdsfree(nodename);
-    if (n->slaveof) clusterNodeRemoveSlave(n->slaveof, n);
+
+    /* Release link and associated data structures. */
     if (n->link) freeClusterLink(n->link);
     listRelease(n->fail_reports);
     zfree(n);
@@ -848,11 +854,16 @@ int clusterAddNode(clusterNode *node) {
     return (retval == DICT_OK) ? REDIS_OK : REDIS_ERR;
 }
 
-/* Remove a node from the cluster:
- * 1) Mark all the nodes handled by it as unassigned.
- * 2) Remove all the failure reports sent by this node.
- * 3) Free the node, that will in turn remove it from the hash table
- *    and from the list of slaves of its master, if it is a slave node.
+/* Remove a node from the cluster. The function performs the high level
+ * cleanup, calling freeClusterNode() for the low level cleanup.
+ * Here we do the following:
+ *
+ * 1) Mark all the slots handled by it as unassigned.
+ * 2) Remove all the failure reports sent by this node and referenced by
+ *    other nodes.
+ * 3) Free the node with freeClusterNode() that will in turn remove it
+ *    from the hash table and from the list of slaves of its master, if
+ *    it is a slave node.
  */
 void clusterDelNode(clusterNode *delnode) {
     int j;
@@ -879,11 +890,7 @@ void clusterDelNode(clusterNode *delnode) {
     }
     dictReleaseIterator(di);
 
-    /* 3) Remove this node from its master's slaves if needed. */
-    if (nodeIsSlave(delnode) && delnode->slaveof)
-        clusterNodeRemoveSlave(delnode->slaveof,delnode);
-
-    /* 4) Free the node, unlinking it from the cluster. */
+    /* 3) Free the node, unlinking it from the cluster. */
     freeClusterNode(delnode);
 }
 
@@ -1619,7 +1626,7 @@ int clusterProcessPacket(clusterLink *link) {
                     }
                     /* Free this node as we already have it. This will
                      * cause the link to be freed as well. */
-                    freeClusterNode(link->node);
+                    clusterDelNode(link->node);
                     return 0;
                 }
 
@@ -2913,7 +2920,7 @@ void clusterCron(void) {
         /* A Node in HANDSHAKE state has a limited lifespan equal to the
          * configured node timeout. */
         if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) {
-            freeClusterNode(node);
+            clusterDelNode(node);
             continue;
         }
 

From a5bb0a0774fbe89d72de13624b23079031674932 Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Wed, 21 Jan 2015 16:13:30 +0100
Subject: [PATCH 04/11] Cluster/Sentinel test: pause on exceptions as well.

---
 tests/cluster/run.tcl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/cluster/run.tcl b/tests/cluster/run.tcl
index 69a160c4f..f764cea0a 100644
--- a/tests/cluster/run.tcl
+++ b/tests/cluster/run.tcl
@@ -21,6 +21,7 @@ proc main {} {
 
 if {[catch main e]} {
     puts $::errorInfo
+    if {$::pause_on_error} pause_on_error
     cleanup
     exit 1
 }

From b3bf7584b0aa5c2dbc1acf4d7f6b2c3d420e8e42 Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Wed, 21 Jan 2015 16:18:34 +0100
Subject: [PATCH 05/11] Cluster/Sentinel test: also pause on
 abort_sentinel_test call.

---
 tests/instances.tcl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/instances.tcl b/tests/instances.tcl
index 4e2f33dfc..a68b79d11 100644
--- a/tests/instances.tcl
+++ b/tests/instances.tcl
@@ -105,6 +105,7 @@ proc cleanup {} {
 proc abort_sentinel_test msg {
     puts "WARNING: Aborting the test."
     puts ">>>>>>>> $msg"
+    if {$::pause_on_error} pause_on_error
     cleanup
     exit 1
 }

From acb1d8debf23f3dbd9199d1276a86ada71750196 Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Wed, 21 Jan 2015 16:46:51 +0100
Subject: [PATCH 06/11] Cluster test: wait for port to be unbound in
 kill_instance.

Otherwise kill_instance + restart_instance in short succession will
still find the port busy and will fail.
---
 tests/instances.tcl | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/instances.tcl b/tests/instances.tcl
index a68b79d11..7d87cdf59 100644
--- a/tests/instances.tcl
+++ b/tests/instances.tcl
@@ -370,15 +370,31 @@ proc get_instance_id_by_port {type port} {
 # The instance can be restarted with restart-instance.
 proc kill_instance {type id} {
     set pid [get_instance_attrib $type $id pid]
+    set port [get_instance_attrib $type $id port]
+
     if {$pid == -1} {
         error "You tried to kill $type $id twice."
     }
+
     exec kill -9 $pid
     set_instance_attrib $type $id pid -1
     set_instance_attrib $type $id link you_tried_to_talk_with_killed_instance
 
     # Remove the PID from the list of pids to kill at exit.
     set ::pids [lsearch -all -inline -not -exact $::pids $pid]
+
+    # Wait until nothing accepts connections on the old port anymore, so that
+    # a new server can be started right away on the same port.
+    set retry 10
+    while {[incr retry -1]} {
+        set port_is_free [catch {set s [socket 127.0.0.1 $port]}]
+        if {$port_is_free} break
+        catch {close $s}
+        after 1000
+    }
+    if {$retry == 0} {
+        error "Port $port does not return available after killing instance."
+    }
 }
 
 # Return true of the instance of the specified type/id is killed.

From 72b8574cca7480f8d4a318727c6dacad891733d6 Mon Sep 17 00:00:00 2001
From: Matt Stancliff <matt@genges.com>
Date: Wed, 14 Jan 2015 11:21:50 -0500
Subject: [PATCH 07/11] Fix sending uninitialized bytes

Fixes valgrind error:
Syscall param write(buf) points to uninitialised byte(s)
   at 0x514C35D: ??? (syscall-template.S:81)
   by 0x456B81: clusterWriteHandler (cluster.c:1907)
   by 0x41D596: aeProcessEvents (ae.c:416)
   by 0x41D8EA: aeMain (ae.c:455)
   by 0x41A84B: main (redis.c:3832)
 Address 0x5f268e2 is 2,274 bytes inside a block of size 8,192 alloc'd
   at 0x4932D1: je_realloc (jemalloc.c:1297)
   by 0x428185: zrealloc (zmalloc.c:162)
   by 0x4269E0: sdsMakeRoomFor.part.0 (sds.c:142)
   by 0x426CD7: sdscatlen (sds.c:251)
   by 0x4579E7: clusterSendMessage (cluster.c:1995)
   by 0x45805A: clusterSendPing (cluster.c:2140)
   by 0x45BB03: clusterCron (cluster.c:2944)
   by 0x423344: serverCron (redis.c:1239)
   by 0x41D6CD: aeProcessEvents (ae.c:311)
   by 0x41D8EA: aeMain (ae.c:455)
   by 0x41A84B: main (redis.c:3832)
 Uninitialised value was created by a stack allocation
   at 0x457810: nodeUpdateAddressIfNeeded (cluster.c:1236)
---
 src/cluster.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cluster.c b/src/cluster.c
index 01d6a32e8..328dc2c85 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -1249,7 +1249,7 @@ void nodeIp2String(char *buf, clusterLink *link) {
  * The function returns 0 if the node address is still the same,
  * otherwise 1 is returned. */
 int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, int port) {
-    char ip[REDIS_IP_STR_LEN];
+    char ip[REDIS_IP_STR_LEN] = {0};
 
     /* We don't proceed if the link is the same as the sender link, as this
      * function is designed to see if the node link is consistent with the

From 30152554eab1d5fa3850ad6ad372aeb3dc1ebacf Mon Sep 17 00:00:00 2001
From: Matt Stancliff <matt@genges.com>
Date: Wed, 14 Jan 2015 11:31:17 -0500
Subject: [PATCH 08/11] Fix cluster reset memory leak

[maybe] Fixes valgrind errors:
32 bytes in 4 blocks are definitely lost in loss record 107 of 228
   at 0x80EA447: je_malloc (jemalloc.c:944)
   by 0x806E59C: zrealloc (zmalloc.c:125)
   by 0x80A9AFC: clusterSetMaster (cluster.c:801)
   by 0x80AEDC9: clusterCommand (cluster.c:3994)
   by 0x80682A5: call (redis.c:2049)
   by 0x8068A20: processCommand (redis.c:2309)
   by 0x8076497: processInputBuffer (networking.c:1143)
   by 0x8073BAF: readQueryFromClient (networking.c:1208)
   by 0x8060E98: aeProcessEvents (ae.c:412)
   by 0x806123B: aeMain (ae.c:455)
   by 0x806C3DB: main (redis.c:3832)

64 bytes in 8 blocks are definitely lost in loss record 143 of 228
   at 0x80EA447: je_malloc (jemalloc.c:944)
   by 0x806E59C: zrealloc (zmalloc.c:125)
   by 0x80AAB40: clusterProcessPacket (cluster.c:801)
   by 0x80A847F: clusterReadHandler (cluster.c:1975)
   by 0x30000FF: ???

80 bytes in 10 blocks are definitely lost in loss record 148 of 228
   at 0x80EA447: je_malloc (jemalloc.c:944)
   by 0x806E59C: zrealloc (zmalloc.c:125)
   by 0x80AAB40: clusterProcessPacket (cluster.c:801)
   by 0x80A847F: clusterReadHandler (cluster.c:1975)
   by 0x2FFFFFF: ???
---
 src/cluster.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cluster.c b/src/cluster.c
index 328dc2c85..71b17c977 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -842,6 +842,7 @@ void freeClusterNode(clusterNode *n) {
     /* Release link and associated data structures. */
     if (n->link) freeClusterLink(n->link);
     listRelease(n->fail_reports);
+    zfree(n->slaves);
     zfree(n);
 }
 

From 29049507ec34efd59ce6de7cff524fb44b47f934 Mon Sep 17 00:00:00 2001
From: Matt Stancliff <matt@genges.com>
Date: Wed, 14 Jan 2015 11:10:25 -0500
Subject: [PATCH 09/11] Fix potential invalid read past end of array

If array has N elements, we can't read +1 if we are already at N.

Also, we need to move elements by their storage size in the array,
not just by individual bytes.
---
 src/cluster.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index 71b17c977..ba84b3a91 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -783,8 +783,11 @@ int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) {
 
     for (j = 0; j < master->numslaves; j++) {
         if (master->slaves[j] == slave) {
-            memmove(master->slaves+j,master->slaves+(j+1),
-                (master->numslaves-1)-j);
+            if ((j+1) < master->numslaves) {
+                int remaining_slaves = (master->numslaves - j) - 1;
+                memmove(master->slaves+j,master->slaves+(j+1),
+                        (sizeof(*master->slaves) * remaining_slaves));
+            }
             master->numslaves--;
             return REDIS_OK;
         }

From 051a43e03a4db665b3bf6e8b45790298c86a96af Mon Sep 17 00:00:00 2001
From: Matt Stancliff <matt@genges.com>
Date: Thu, 15 Jan 2015 14:20:59 -0500
Subject: [PATCH 10/11] Fix cluster migrate memory leak

Fixes valgrind error:
48 bytes in 1 blocks are definitely lost in loss record 196 of 373
   at 0x4910D3: je_malloc (jemalloc.c:944)
   by 0x42807D: zmalloc (zmalloc.c:125)
   by 0x41FA0D: dictGetIterator (dict.c:543)
   by 0x41FA48: dictGetSafeIterator (dict.c:555)
   by 0x459B73: clusterHandleSlaveMigration (cluster.c:2776)
   by 0x45BF27: clusterCron (cluster.c:3123)
   by 0x423344: serverCron (redis.c:1239)
   by 0x41D6CD: aeProcessEvents (ae.c:311)
   by 0x41D8EA: aeMain (ae.c:455)
   by 0x41A84B: main (redis.c:3832)
---
 src/cluster.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cluster.c b/src/cluster.c
index ba84b3a91..3381d98c8 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -2803,6 +2803,7 @@ void clusterHandleSlaveMigration(int max_slaves) {
             }
         }
     }
+    dictReleaseIterator(di);
 
     /* Step 4: perform the migration if there is a target, and if I'm the
      * candidate. */

From 7e79b3f51a06ced3c13a9b8fe756e322705d5ca9 Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Wed, 21 Jan 2015 18:48:08 +0100
Subject: [PATCH 11/11] Cluster test initialization: use transaction for reset
 + set-config-epoch.

Otherwise between the two commands other nodes may contact us making the
next SET-CONFIG-EPOCH call impossible.
---
 tests/cluster/tests/includes/init-tests.tcl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/cluster/tests/includes/init-tests.tcl b/tests/cluster/tests/includes/init-tests.tcl
index 65fc806e1..117f79208 100644
--- a/tests/cluster/tests/includes/init-tests.tcl
+++ b/tests/cluster/tests/includes/init-tests.tcl
@@ -28,8 +28,10 @@ test "Cluster nodes are reachable" {
 test "Cluster nodes hard reset" {
     foreach_redis_id id {
         catch {R $id flushall} ; # May fail for readonly slaves.
+        R $id MULTI
         R $id cluster reset hard
         R $id cluster set-config-epoch [expr {$id+1}]
+        R $id EXEC
         R $id config set cluster-node-timeout 3000
         R $id config set cluster-slave-validity-factor 10
         R $id config rewrite