diff --git a/TLS.md b/TLS.md new file mode 100644 index 000000000..76fe0be2e --- /dev/null +++ b/TLS.md @@ -0,0 +1,106 @@ +TLS Support -- Work In Progress +=============================== + +This is a brief note to capture current thoughts/ideas and track pending action +items. + +Getting Started +--------------- + +### Building + +To build with TLS support you'll need OpenSSL development libraries (e.g. +libssl-dev on Debian/Ubuntu). + +Run `make BUILD_TLS=yes`. + +### Tests + +To run Redis test suite with TLS, you'll need TLS support for TCL (i.e. +`tcl-tls` package on Debian/Ubuntu). + +1. Run `./utils/gen-test-certs.sh` to generate a root CA and a server + certificate. + +2. Run `./runtest --tls` or `./runtest-cluster --tls` to run Redis and Redis + Cluster tests in TLS mode. + +### Running manually + +To manually run a Redis server with TLS mode (assuming `gen-test-certs.sh` was +invoked so sample certificates/keys are available): + + ./src/redis-server --tls-port 6379 --port 0 \ + --tls-cert-file ./tests/tls/redis.crt \ + --tls-key-file ./tests/tls/redis.key \ + --tls-ca-cert-file ./tests/tls/ca.crt + +To connect to this Redis server with `redis-cli`: + + ./src/redis-cli --tls \ + --cert ./tests/tls/redis.crt \ + --key ./tests/tls/redis.key \ + --cacert ./tests/tls/ca.crt + +This will disable TCP and enable TLS on port 6379. It's also possible to have +both TCP and TLS available, but you'll need to assign different ports. + +To make a Replica connect to the master using TLS, use `--tls-replication yes`, +and to make Redis Cluster use TLS across nodes use `--tls-cluster yes`. + +Connections +----------- + +All socket operations now go through a connection abstraction layer that hides +I/O and read/write event handling from the caller. + +**Multi-threading I/O is not currently supported for TLS**, as a TLS connection +needs to do its own manipulation of AE events which is not thread safe. 
The +solution is probably to manage independent AE loops for I/O threads and longer +term association of connections with threads. This may potentially improve +overall performance as well. + +Sync IO for TLS is currently implemented in a hackish way, i.e. making the +socket blocking and configuring socket-level timeout. This means the timeout +value may not be so accurate, and there would be a lot of syscall overhead. +However, I believe that getting rid of syncio completely in favor of pure async +work is probably a better move than trying to fix that. For replication it would +probably not be so hard. For cluster keys migration it might be more difficult, +but there are probably other good reasons to improve that part anyway. + +To-Do List +========== + +Additional TLS Features +----------------------- + +1. Add metrics to INFO? +2. Add session caching support. Check if/how it's handled by clients to assess + how useful/important it is. + +redis-benchmark +--------------- + +The current implementation is a mix of using hiredis for parsing and basic +networking (establishing connections), but directly manipulating sockets for +most actions. + +This will need to be cleaned up for proper TLS support. The best approach is +probably to migrate to hiredis async mode. + +redis-cli +--------- + +1. Add support for TLS in --slave and --rdb modes. + +Others +------ + +Consider the implications of allowing TLS to be configured on a separate port, +making Redis listen on multiple ports. + +This impacts many things, like +1. Startup banner port notification +2. Proctitle +3. How slaves announce themselves +4. 
Cluster bus port calculation diff --git a/deps/Makefile b/deps/Makefile index eb35c1e1f..700867f3b 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -41,9 +41,13 @@ distclean: .PHONY: distclean +ifeq ($(BUILD_TLS),yes) + HIREDIS_MAKE_FLAGS = USE_SSL=1 +endif + hiredis: .make-prerequisites @printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR) - cd hiredis && $(MAKE) static + cd hiredis && $(MAKE) static $(HIREDIS_MAKE_FLAGS) .PHONY: hiredis diff --git a/redis.conf b/redis.conf index 50ba823ac..408426f15 100644 --- a/redis.conf +++ b/redis.conf @@ -129,6 +129,76 @@ timeout 0 # Redis default starting with Redis 3.2.1. tcp-keepalive 300 +################################# TLS/SSL ##################################### + +# By default, TLS/SSL is disabled. To enable it, the "tls-port" configuration +# directive can be used to define TLS-listening ports. To enable TLS on the +# default port, use: +# +# port 0 +# tls-port 6379 + +# Configure a X.509 certificate and private key to use for authenticating the +# server to connected clients, masters or cluster peers. These files should be +# PEM formatted. +# +# tls-cert-file redis.crt tls-key-file redis.key + +# Configure a DH parameters file to enable Diffie-Hellman (DH) key exchange: +# +# tls-dh-params-file redis.dh + +# Configure a CA certificate(s) bundle or directory to authenticate TLS/SSL +# clients and peers. Redis requires an explicit configuration of at least one +# of these, and will not implicitly use the system wide configuration. +# +# tls-ca-cert-file ca.crt +# tls-ca-cert-dir /etc/ssl/certs + +# If TLS/SSL clients are required to authenticate using a client side +# certificate, use this directive. +# +# Note: this applies to all incoming clients, including replicas. 
+# +# tls-auth-clients yes + +# If TLS/SSL should be used when connecting as a replica to a master, enable +# this configuration directive: +# +# tls-replication yes + +# If TLS/SSL should be used for the Redis Cluster bus, enable this configuration +# directive. +# +# NOTE: If TLS/SSL is enabled for Cluster Bus, mutual authentication is always +# enforced. +# +# tls-cluster yes + +# Explicitly specify TLS versions to support. Allowed values are case insensitive +# and include "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" (OpenSSL >= 1.1.1) or +# "default" which is currently >= TLSv1.1. +# +# tls-protocols TLSv1.2 + +# Configure allowed ciphers. See the ciphers(1ssl) manpage for more information +# about the syntax of this string. +# +# Note: this configuration applies only to <= TLSv1.2. +# +# tls-ciphers DEFAULT:!MEDIUM + +# Configure allowed TLSv1.3 ciphersuites. See the ciphers(1ssl) manpage for more +# information about the syntax of this string, and specifically for TLSv1.3 +# ciphersuites. +# +# tls-ciphersuites TLS_CHACHA20_POLY1305_SHA256 + +# When choosing a cipher, use the server's preference instead of the client +# preference. By default, the server follows the client's preference. +# +# tls-prefer-server-ciphers yes + ################################# GENERAL ##################################### # By default Redis does not run as a daemon. Use 'yes' if you need it. 
diff --git a/src/Makefile b/src/Makefile index a76adbf4f..9fc230f94 100644 --- a/src/Makefile +++ b/src/Makefile @@ -93,6 +93,8 @@ else ifeq ($(uname_S),Darwin) # Darwin FINAL_LIBS+= -ldl + OPENSSL_CFLAGS=-I/usr/local/opt/openssl/include + OPENSSL_LDFLAGS=-L/usr/local/opt/openssl/lib else ifeq ($(uname_S),AIX) # AIX @@ -145,6 +147,12 @@ ifeq ($(MALLOC),jemalloc) FINAL_LIBS := ../deps/jemalloc/lib/libjemalloc.a $(FINAL_LIBS) endif +ifeq ($(BUILD_TLS),yes) + FINAL_CFLAGS+=-DUSE_OPENSSL $(OPENSSL_CFLAGS) + FINAL_LDFLAGS+=$(OPENSSL_LDFLAGS) + FINAL_LIBS += ../deps/hiredis/libhiredis_ssl.a -lssl -lcrypto +endif + REDIS_CC=$(QUIET_CC)$(CC) $(FINAL_CFLAGS) REDIS_LD=$(QUIET_LINK)$(CC) $(FINAL_LDFLAGS) REDIS_INSTALL=$(QUIET_INSTALL)$(INSTALL) @@ -164,7 +172,7 @@ endif REDIS_SERVER_NAME=redis-server REDIS_SENTINEL_NAME=redis-sentinel -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o gopher.o tracking.o sha256.o +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o 
setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o gopher.o tracking.o connection.o tls.o sha256.o REDIS_CLI_NAME=redis-cli REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o siphash.o crc16.o REDIS_BENCHMARK_NAME=redis-benchmark diff --git a/src/ae.c b/src/ae.c index 53629ef77..2c1dae512 100644 --- a/src/ae.c +++ b/src/ae.c @@ -76,6 +76,7 @@ aeEventLoop *aeCreateEventLoop(int setsize) { eventLoop->maxfd = -1; eventLoop->beforesleep = NULL; eventLoop->aftersleep = NULL; + eventLoop->flags = 0; if (aeApiCreate(eventLoop) == -1) goto err; /* Events with mask == AE_NONE are not set. So let's initialize the * vector with it. */ @@ -97,6 +98,14 @@ int aeGetSetSize(aeEventLoop *eventLoop) { return eventLoop->setsize; } +/* Tells the next iteration/s of the event processing to set timeout of 0. */ +void aeSetDontWait(aeEventLoop *eventLoop, int noWait) { + if (noWait) + eventLoop->flags |= AE_DONT_WAIT; + else + eventLoop->flags &= ~AE_DONT_WAIT; +} + /* Resize the maximum set size of the event loop. * If the requested set size is smaller than the current set size, but * there is already a file descriptor in use that is >= the requested @@ -406,6 +415,11 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags) } } + if (eventLoop->flags & AE_DONT_WAIT) { + tv.tv_sec = tv.tv_usec = 0; + tvp = &tv; + } + /* Call the multiplexing API, will return only on timeout or when * some event fires. 
*/ numevents = aeApiPoll(eventLoop, tvp); diff --git a/src/ae.h b/src/ae.h index 184fe3d1b..9acd72434 100644 --- a/src/ae.h +++ b/src/ae.h @@ -106,6 +106,7 @@ typedef struct aeEventLoop { void *apidata; /* This is used for polling API specific data */ aeBeforeSleepProc *beforesleep; aeBeforeSleepProc *aftersleep; + int flags; } aeEventLoop; /* Prototypes */ @@ -128,5 +129,6 @@ void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep); int aeGetSetSize(aeEventLoop *eventLoop); int aeResizeSetSize(aeEventLoop *eventLoop, int setsize); +void aeSetDontWait(aeEventLoop *eventLoop, int noWait); #endif diff --git a/src/ae_epoll.c b/src/ae_epoll.c index 410aac70d..fa197297e 100644 --- a/src/ae_epoll.c +++ b/src/ae_epoll.c @@ -121,8 +121,8 @@ static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { if (e->events & EPOLLIN) mask |= AE_READABLE; if (e->events & EPOLLOUT) mask |= AE_WRITABLE; - if (e->events & EPOLLERR) mask |= AE_WRITABLE; - if (e->events & EPOLLHUP) mask |= AE_WRITABLE; + if (e->events & EPOLLERR) mask |= AE_WRITABLE|AE_READABLE; + if (e->events & EPOLLHUP) mask |= AE_WRITABLE|AE_READABLE; eventLoop->fired[j].fd = e->data.fd; eventLoop->fired[j].mask = mask; } diff --git a/src/anet.c b/src/anet.c index 2088f4fb1..46ea7e145 100644 --- a/src/anet.c +++ b/src/anet.c @@ -279,8 +279,8 @@ static int anetCreateSocket(char *err, int domain) { #define ANET_CONNECT_NONE 0 #define ANET_CONNECT_NONBLOCK 1 #define ANET_CONNECT_BE_BINDING 2 /* Best effort binding. 
*/ -static int anetTcpGenericConnect(char *err, char *addr, int port, - char *source_addr, int flags) +static int anetTcpGenericConnect(char *err, const char *addr, int port, + const char *source_addr, int flags) { int s = ANET_ERR, rv; char portstr[6]; /* strlen("65535") + 1; */ @@ -359,31 +359,31 @@ end: } } -int anetTcpConnect(char *err, char *addr, int port) +int anetTcpConnect(char *err, const char *addr, int port) { return anetTcpGenericConnect(err,addr,port,NULL,ANET_CONNECT_NONE); } -int anetTcpNonBlockConnect(char *err, char *addr, int port) +int anetTcpNonBlockConnect(char *err, const char *addr, int port) { return anetTcpGenericConnect(err,addr,port,NULL,ANET_CONNECT_NONBLOCK); } -int anetTcpNonBlockBindConnect(char *err, char *addr, int port, - char *source_addr) +int anetTcpNonBlockBindConnect(char *err, const char *addr, int port, + const char *source_addr) { return anetTcpGenericConnect(err,addr,port,source_addr, ANET_CONNECT_NONBLOCK); } -int anetTcpNonBlockBestEffortBindConnect(char *err, char *addr, int port, - char *source_addr) +int anetTcpNonBlockBestEffortBindConnect(char *err, const char *addr, int port, + const char *source_addr) { return anetTcpGenericConnect(err,addr,port,source_addr, ANET_CONNECT_NONBLOCK|ANET_CONNECT_BE_BINDING); } -int anetUnixGenericConnect(char *err, char *path, int flags) +int anetUnixGenericConnect(char *err, const char *path, int flags) { int s; struct sockaddr_un sa; @@ -411,12 +411,12 @@ int anetUnixGenericConnect(char *err, char *path, int flags) return s; } -int anetUnixConnect(char *err, char *path) +int anetUnixConnect(char *err, const char *path) { return anetUnixGenericConnect(err,path,ANET_CONNECT_NONE); } -int anetUnixNonBlockConnect(char *err, char *path) +int anetUnixNonBlockConnect(char *err, const char *path) { return anetUnixGenericConnect(err,path,ANET_CONNECT_NONBLOCK); } diff --git a/src/anet.h b/src/anet.h index dd735240d..23f19643c 100644 --- a/src/anet.h +++ b/src/anet.h @@ -49,12 +49,12 @@ 
#undef ip_len #endif -int anetTcpConnect(char *err, char *addr, int port); -int anetTcpNonBlockConnect(char *err, char *addr, int port); -int anetTcpNonBlockBindConnect(char *err, char *addr, int port, char *source_addr); -int anetTcpNonBlockBestEffortBindConnect(char *err, char *addr, int port, char *source_addr); -int anetUnixConnect(char *err, char *path); -int anetUnixNonBlockConnect(char *err, char *path); +int anetTcpConnect(char *err, const char *addr, int port); +int anetTcpNonBlockConnect(char *err, const char *addr, int port); +int anetTcpNonBlockBindConnect(char *err, const char *addr, int port, const char *source_addr); +int anetTcpNonBlockBestEffortBindConnect(char *err, const char *addr, int port, const char *source_addr); +int anetUnixConnect(char *err, const char *path); +int anetUnixNonBlockConnect(char *err, const char *path); int anetRead(int fd, char *buf, int count); int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len); int anetResolveIP(char *err, char *host, char *ipbuf, size_t ipbuf_len); diff --git a/src/aof.c b/src/aof.c index 4e6af7c1c..32684eb89 100644 --- a/src/aof.c +++ b/src/aof.c @@ -385,6 +385,10 @@ void flushAppendOnlyFile(int force) { * there is much to do about the whole server stopping for power problems * or alike */ + if (server.aof_flush_sleep && sdslen(server.aof_buf)) { + usleep(server.aof_flush_sleep); + } + latencyStartMonitor(latency); nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf)); latencyEndMonitor(latency); @@ -652,7 +656,7 @@ struct client *createFakeClient(void) { struct client *c = zmalloc(sizeof(*c)); selectDb(c,0); - c->fd = -1; + c->conn = NULL; c->name = NULL; c->querybuf = sdsempty(); c->querybuf_peak = 0; @@ -835,6 +839,8 @@ int loadAppendOnlyFile(char *filename) { freeFakeClientArgv(fakeClient); fakeClient->cmd = NULL; if (server.aof_load_truncated) valid_up_to = ftello(fp); + if (server.key_load_delay) + usleep(server.key_load_delay); } /* This point can only be 
reached when EOF is reached without errors. diff --git a/src/cluster.c b/src/cluster.c index 93be2aa32..a7d8a02c3 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -49,7 +49,7 @@ clusterNode *myself = NULL; clusterNode *createClusterNode(char *nodename, int flags); int clusterAddNode(clusterNode *node); void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); -void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask); +void clusterReadHandler(connection *conn); void clusterSendPing(clusterLink *link, int type); void clusterSendFail(char *nodename); void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request); @@ -477,7 +477,8 @@ void clusterInit(void) { /* Port sanity check II * The other handshake port check is triggered too late to stop * us from trying to use a too-high cluster port number. */ - if (server.port > (65535-CLUSTER_PORT_INCR)) { + int port = server.tls_cluster ? server.tls_port : server.port; + if (port > (65535-CLUSTER_PORT_INCR)) { serverLog(LL_WARNING, "Redis port number too high. " "Cluster communication port is 10,000 port " "numbers higher than your Redis port. " @@ -485,8 +486,7 @@ void clusterInit(void) { "lower than 55535."); exit(1); } - - if (listenToPort(server.port+CLUSTER_PORT_INCR, + if (listenToPort(port+CLUSTER_PORT_INCR, server.cfd,&server.cfd_count) == C_ERR) { exit(1); @@ -508,8 +508,8 @@ void clusterInit(void) { /* Set myself->port / cport to my listening ports, we'll just need to * discover the IP address via MEET messages. 
*/ - myself->port = server.port; - myself->cport = server.port+CLUSTER_PORT_INCR; + myself->port = port; + myself->cport = port+CLUSTER_PORT_INCR; if (server.cluster_announce_port) myself->port = server.cluster_announce_port; if (server.cluster_announce_bus_port) @@ -593,7 +593,7 @@ clusterLink *createClusterLink(clusterNode *node) { link->sndbuf = sdsempty(); link->rcvbuf = sdsempty(); link->node = node; - link->fd = -1; + link->conn = NULL; return link; } @@ -601,23 +601,45 @@ clusterLink *createClusterLink(clusterNode *node) { * This function will just make sure that the original node associated * with this link will have the 'link' field set to NULL. */ void freeClusterLink(clusterLink *link) { - if (link->fd != -1) { - aeDeleteFileEvent(server.el, link->fd, AE_READABLE|AE_WRITABLE); + if (link->conn) { + connClose(link->conn); + link->conn = NULL; } sdsfree(link->sndbuf); sdsfree(link->rcvbuf); if (link->node) link->node->link = NULL; - close(link->fd); zfree(link); } +static void clusterConnAcceptHandler(connection *conn) { + clusterLink *link; + + if (connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_VERBOSE, + "Error accepting cluster node connection: %s", connGetLastError(conn)); + connClose(conn); + return; + } + + /* Create a link object we use to handle the connection. + * It gets passed to the readable handler when data is available. + * Initiallly the link->node pointer is set to NULL as we don't know + * which node is, but the right node is references once we know the + * node identity. 
*/ + link = createClusterLink(NULL); + link->conn = conn; + connSetPrivateData(conn, link); + + /* Register read handler */ + connSetReadHandler(conn, clusterReadHandler); +} + #define MAX_CLUSTER_ACCEPTS_PER_CALL 1000 void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { int cport, cfd; int max = MAX_CLUSTER_ACCEPTS_PER_CALL; char cip[NET_IP_STR_LEN]; - clusterLink *link; UNUSED(el); UNUSED(mask); UNUSED(privdata); @@ -634,19 +656,24 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { "Error accepting cluster node: %s", server.neterr); return; } - anetNonBlock(NULL,cfd); - anetEnableTcpNoDelay(NULL,cfd); + + connection *conn = server.tls_cluster ? connCreateAcceptedTLS(cfd,1) : connCreateAcceptedSocket(cfd); + connNonBlock(conn); + connEnableTcpNoDelay(conn); /* Use non-blocking I/O for cluster messages. */ - serverLog(LL_VERBOSE,"Accepted cluster node %s:%d", cip, cport); - /* Create a link object we use to handle the connection. - * It gets passed to the readable handler when data is available. - * Initiallly the link->node pointer is set to NULL as we don't know - * which node is, but the right node is references once we know the - * node identity. */ - link = createClusterLink(NULL); - link->fd = cfd; - aeCreateFileEvent(server.el,cfd,AE_READABLE,clusterReadHandler,link); + serverLog(LL_VERBOSE,"Accepting cluster node connection from %s:%d", cip, cport); + + /* Accept the connection now. connAccept() may call our handler directly + * or schedule it for later depending on connection implementation. + */ + if (connAccept(conn, clusterConnAcceptHandler) == C_ERR) { + serverLog(LL_VERBOSE, + "Error accepting cluster node connection: %s", + connGetLastError(conn)); + connClose(conn); + return; + } } } @@ -1447,7 +1474,7 @@ void nodeIp2String(char *buf, clusterLink *link, char *announced_ip) { memcpy(buf,announced_ip,NET_IP_STR_LEN); buf[NET_IP_STR_LEN-1] = '\0'; /* We are not sure the input is sane. 
*/ } else { - anetPeerToString(link->fd, buf, NET_IP_STR_LEN, NULL); + connPeerToString(link->conn, buf, NET_IP_STR_LEN, NULL); } } @@ -1751,7 +1778,7 @@ int clusterProcessPacket(clusterLink *link) { { char ip[NET_IP_STR_LEN]; - if (anetSockName(link->fd,ip,sizeof(ip),NULL) != -1 && + if (connSockName(link->conn,ip,sizeof(ip),NULL) != -1 && strcmp(ip,myself->ip)) { memcpy(myself->ip,ip,NET_IP_STR_LEN); @@ -2118,35 +2145,76 @@ void handleLinkIOError(clusterLink *link) { /* Send data. This is handled using a trivial send buffer that gets * consumed by write(). We don't try to optimize this for speed too much * as this is a very low traffic channel. */ -void clusterWriteHandler(aeEventLoop *el, int fd, void *privdata, int mask) { - clusterLink *link = (clusterLink*) privdata; +void clusterWriteHandler(connection *conn) { + clusterLink *link = connGetPrivateData(conn); ssize_t nwritten; - UNUSED(el); - UNUSED(mask); - nwritten = write(fd, link->sndbuf, sdslen(link->sndbuf)); + nwritten = connWrite(conn, link->sndbuf, sdslen(link->sndbuf)); if (nwritten <= 0) { serverLog(LL_DEBUG,"I/O error writing to node link: %s", - (nwritten == -1) ? strerror(errno) : "short write"); + (nwritten == -1) ? connGetLastError(conn) : "short write"); handleLinkIOError(link); return; } sdsrange(link->sndbuf,nwritten,-1); if (sdslen(link->sndbuf) == 0) - aeDeleteFileEvent(server.el, link->fd, AE_WRITABLE); + connSetWriteHandler(link->conn, NULL); +} + +/* A connect handler that gets called when a connection to another node + * gets established. 
+ */ +void clusterLinkConnectHandler(connection *conn) { + clusterLink *link = connGetPrivateData(conn); + clusterNode *node = link->node; + + /* Check if connection succeeded */ + if (connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_VERBOSE, "Connection with Node %.40s at %s:%d failed: %s", + node->name, node->ip, node->cport, + connGetLastError(conn)); + freeClusterLink(link); + return; + } + + /* Register a read handler from now on */ + connSetReadHandler(conn, clusterReadHandler); + + /* Queue a PING in the new connection ASAP: this is crucial + * to avoid false positives in failure detection. + * + * If the node is flagged as MEET, we send a MEET message instead + * of a PING one, to force the receiver to add us in its node + * table. */ + mstime_t old_ping_sent = node->ping_sent; + clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? + CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); + if (old_ping_sent) { + /* If there was an active ping before the link was + * disconnected, we want to restore the ping time, otherwise + * replaced by the clusterSendPing() call. */ + node->ping_sent = old_ping_sent; + } + /* We can clear the flag after the first packet is sent. + * If we'll never receive a PONG, we'll never send new packets + * to this node. Instead after the PONG is received and we + * are no longer in meet/handshake status, we want to send + * normal PING packets. */ + node->flags &= ~CLUSTER_NODE_MEET; + + serverLog(LL_DEBUG,"Connecting with Node %.40s at %s:%d", + node->name, node->ip, node->cport); } /* Read data. Try to read the first field of the header first to check the * full length of the packet. When a whole packet is in memory this function * will call the function to process the packet. And so forth. 
*/ -void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) { +void clusterReadHandler(connection *conn) { clusterMsg buf[1]; ssize_t nread; clusterMsg *hdr; - clusterLink *link = (clusterLink*) privdata; + clusterLink *link = connGetPrivateData(conn); unsigned int readlen, rcvbuflen; - UNUSED(el); - UNUSED(mask); while(1) { /* Read as long as there is data to read. */ rcvbuflen = sdslen(link->rcvbuf); @@ -2174,13 +2242,13 @@ void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) { if (readlen > sizeof(buf)) readlen = sizeof(buf); } - nread = read(fd,buf,readlen); - if (nread == -1 && errno == EAGAIN) return; /* No more data ready. */ + nread = connRead(conn,buf,readlen); + if (nread == -1 && (connGetState(conn) == CONN_STATE_CONNECTED)) return; /* No more data ready. */ if (nread <= 0) { /* I/O error... */ serverLog(LL_DEBUG,"I/O error reading from node link: %s", - (nread == 0) ? "connection closed" : strerror(errno)); + (nread == 0) ? "connection closed" : connGetLastError(conn)); handleLinkIOError(link); return; } else { @@ -2209,8 +2277,7 @@ void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) { * from event handlers that will do stuff with the same link later. */ void clusterSendMessage(clusterLink *link, unsigned char *msg, size_t msglen) { if (sdslen(link->sndbuf) == 0 && msglen != 0) - aeCreateFileEvent(server.el,link->fd,AE_WRITABLE|AE_BARRIER, - clusterWriteHandler,link); + connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); link->sndbuf = sdscatlen(link->sndbuf, msg, msglen); @@ -2276,11 +2343,12 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { } /* Handle cluster-announce-port as well. */ + int port = server.tls_cluster ? server.tls_port : server.port; int announced_port = server.cluster_announce_port ? - server.cluster_announce_port : server.port; + server.cluster_announce_port : port; int announced_cport = server.cluster_announce_bus_port ? 
server.cluster_announce_bus_port : - (server.port + CLUSTER_PORT_INCR); + (port + CLUSTER_PORT_INCR); memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots)); memset(hdr->slaveof,0,CLUSTER_NAMELEN); @@ -3385,13 +3453,11 @@ void clusterCron(void) { } if (node->link == NULL) { - int fd; - mstime_t old_ping_sent; - clusterLink *link; - - fd = anetTcpNonBlockBindConnect(server.neterr, node->ip, - node->cport, NET_FIRST_BIND_ADDR); - if (fd == -1) { + clusterLink *link = createClusterLink(node); + link->conn = server.tls_cluster ? connCreateTLS() : connCreateSocket(); + connSetPrivateData(link->conn, link); + if (connConnect(link->conn, node->ip, node->cport, NET_FIRST_BIND_ADDR, + clusterLinkConnectHandler) == -1) { /* We got a synchronous error from connect before * clusterSendPing() had a chance to be called. * If node->ping_sent is zero, failure detection can't work, @@ -3401,37 +3467,11 @@ void clusterCron(void) { serverLog(LL_DEBUG, "Unable to connect to " "Cluster Node [%s]:%d -> %s", node->ip, node->cport, server.neterr); + + freeClusterLink(link); continue; } - link = createClusterLink(node); - link->fd = fd; node->link = link; - aeCreateFileEvent(server.el,link->fd,AE_READABLE, - clusterReadHandler,link); - /* Queue a PING in the new connection ASAP: this is crucial - * to avoid false positives in failure detection. - * - * If the node is flagged as MEET, we send a MEET message instead - * of a PING one, to force the receiver to add us in its node - * table. */ - old_ping_sent = node->ping_sent; - clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? - CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); - if (old_ping_sent) { - /* If there was an active ping before the link was - * disconnected, we want to restore the ping time, otherwise - * replaced by the clusterSendPing() call. */ - node->ping_sent = old_ping_sent; - } - /* We can clear the flag after the first packet is sent. - * If we'll never receive a PONG, we'll never send new packets - * to this node. 
Instead after the PONG is received and we - * are no longer in meet/handshake status, we want to send - * normal PING packets. */ - node->flags &= ~CLUSTER_NODE_MEET; - - serverLog(LL_DEBUG,"Connecting with Node %.40s at %s:%d", - node->name, node->ip, node->cport); } } dictReleaseIterator(di); @@ -4942,7 +4982,7 @@ void restoreCommand(client *c) { #define MIGRATE_SOCKET_CACHE_TTL 10 /* close cached sockets after 10 sec. */ typedef struct migrateCachedSocket { - int fd; + connection *conn; long last_dbid; time_t last_use_time; } migrateCachedSocket; @@ -4959,7 +4999,7 @@ typedef struct migrateCachedSocket { * should be called so that the connection will be created from scratch * the next time. */ migrateCachedSocket* migrateGetSocket(client *c, robj *host, robj *port, long timeout) { - int fd; + connection *conn; sds name = sdsempty(); migrateCachedSocket *cs; @@ -4979,34 +5019,27 @@ migrateCachedSocket* migrateGetSocket(client *c, robj *host, robj *port, long ti /* Too many items, drop one at random. */ dictEntry *de = dictGetRandomKey(server.migrate_cached_sockets); cs = dictGetVal(de); - close(cs->fd); + connClose(cs->conn); zfree(cs); dictDelete(server.migrate_cached_sockets,dictGetKey(de)); } /* Create the socket */ - fd = anetTcpNonBlockConnect(server.neterr,c->argv[1]->ptr, - atoi(c->argv[2]->ptr)); - if (fd == -1) { - sdsfree(name); - addReplyErrorFormat(c,"Can't connect to target node: %s", - server.neterr); - return NULL; - } - anetEnableTcpNoDelay(server.neterr,fd); - - /* Check if it connects within the specified timeout. */ - if ((aeWait(fd,AE_WRITABLE,timeout) & AE_WRITABLE) == 0) { - sdsfree(name); + conn = server.tls_cluster ? 
connCreateTLS() : connCreateSocket(); + if (connBlockingConnect(conn, c->argv[1]->ptr, atoi(c->argv[2]->ptr), timeout) + != C_OK) { addReplySds(c, sdsnew("-IOERR error or timeout connecting to the client\r\n")); - close(fd); + connClose(conn); + sdsfree(name); return NULL; } + connEnableTcpNoDelay(conn); /* Add to the cache and return it to the caller. */ cs = zmalloc(sizeof(*cs)); - cs->fd = fd; + cs->conn = conn; + cs->last_dbid = -1; cs->last_use_time = server.unixtime; dictAdd(server.migrate_cached_sockets,name,cs); @@ -5027,7 +5060,7 @@ void migrateCloseSocket(robj *host, robj *port) { return; } - close(cs->fd); + connClose(cs->conn); zfree(cs); dictDelete(server.migrate_cached_sockets,name); sdsfree(name); @@ -5041,7 +5074,7 @@ void migrateCloseTimedoutSockets(void) { migrateCachedSocket *cs = dictGetVal(de); if ((server.unixtime - cs->last_use_time) > MIGRATE_SOCKET_CACHE_TTL) { - close(cs->fd); + connClose(cs->conn); zfree(cs); dictDelete(server.migrate_cached_sockets,dictGetKey(de)); } @@ -5223,7 +5256,7 @@ try_again: while ((towrite = sdslen(buf)-pos) > 0) { towrite = (towrite > (64*1024) ? (64*1024) : towrite); - nwritten = syncWrite(cs->fd,buf+pos,towrite,timeout); + nwritten = connSyncWrite(cs->conn,buf+pos,towrite,timeout); if (nwritten != (signed)towrite) { write_error = 1; goto socket_err; @@ -5237,11 +5270,11 @@ try_again: char buf2[1024]; /* Restore reply. */ /* Read the AUTH reply if needed. */ - if (password && syncReadLine(cs->fd, buf0, sizeof(buf0), timeout) <= 0) + if (password && connSyncReadLine(cs->conn, buf0, sizeof(buf0), timeout) <= 0) goto socket_err; /* Read the SELECT reply if needed. */ - if (select && syncReadLine(cs->fd, buf1, sizeof(buf1), timeout) <= 0) + if (select && connSyncReadLine(cs->conn, buf1, sizeof(buf1), timeout) <= 0) goto socket_err; /* Read the RESTORE replies. 
*/ @@ -5256,7 +5289,7 @@ try_again: if (!copy) newargv = zmalloc(sizeof(robj*)*(num_keys+1)); for (j = 0; j < num_keys; j++) { - if (syncReadLine(cs->fd, buf2, sizeof(buf2), timeout) <= 0) { + if (connSyncReadLine(cs->conn, buf2, sizeof(buf2), timeout) <= 0) { socket_error = 1; break; } diff --git a/src/cluster.h b/src/cluster.h index 571b9c543..ffbb29f0d 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -40,7 +40,7 @@ struct clusterNode; /* clusterLink encapsulates everything needed to talk with a remote node. */ typedef struct clusterLink { mstime_t ctime; /* Link creation time */ - int fd; /* TCP socket file descriptor */ + connection *conn; /* Connection to remote node */ sds sndbuf; /* Packet send buffer */ sds rcvbuf; /* Packet reception buffer */ struct clusterNode *node; /* Node related to this link if any, or NULL */ diff --git a/src/config.c b/src/config.c index 72fb038ea..505dabc9c 100644 --- a/src/config.c +++ b/src/config.c @@ -220,7 +220,7 @@ void queueLoadModule(sds path, sds *argv, int argc) { } void loadServerConfigFromString(char *config) { - char *err = NULL; + const char *err = NULL; int linenum = 0, totlines, i; int slaveof_linenum = 0; sds *lines; @@ -515,6 +515,12 @@ void loadServerConfigFromString(char *config) { err = "rdb-key-save-delay can't be negative"; goto loaderr; } + } else if (!strcasecmp(argv[0],"key-load-delay") && argc==2) { + server.key_load_delay = atoi(argv[1]); + if (server.key_load_delay < 0) { + err = "key-load-delay can't be negative"; + goto loaderr; + } } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) { if (strlen(argv[1]) > CONFIG_AUTHPASS_MAX_LEN) { err = "Password is longer than CONFIG_AUTHPASS_MAX_LEN"; @@ -797,6 +803,45 @@ void loadServerConfigFromString(char *config) { err = sentinelHandleConfiguration(argv+1,argc-1); if (err) goto loaderr; } +#ifdef USE_OPENSSL + } else if (!strcasecmp(argv[0],"tls-port") && argc == 2) { + server.tls_port = atoi(argv[1]); + if (server.tls_port < 0 || server.tls_port > 65535) { 
+ err = "Invalid tls-port"; goto loaderr; + } + } else if (!strcasecmp(argv[0],"tls-cluster") && argc == 2) { + server.tls_cluster = yesnotoi(argv[1]); + } else if (!strcasecmp(argv[0],"tls-replication") && argc == 2) { + server.tls_replication = yesnotoi(argv[1]); + } else if (!strcasecmp(argv[0],"tls-auth-clients") && argc == 2) { + server.tls_auth_clients = yesnotoi(argv[1]); + } else if (!strcasecmp(argv[0],"tls-cert-file") && argc == 2) { + zfree(server.tls_ctx_config.cert_file); + server.tls_ctx_config.cert_file = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"tls-key-file") && argc == 2) { + zfree(server.tls_ctx_config.key_file); + server.tls_ctx_config.key_file = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"tls-dh-params-file") && argc == 2) { + zfree(server.tls_ctx_config.dh_params_file); + server.tls_ctx_config.dh_params_file = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"tls-ca-cert-file") && argc == 2) { + zfree(server.tls_ctx_config.ca_cert_file); + server.tls_ctx_config.ca_cert_file = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"tls-ca-cert-dir") && argc == 2) { + zfree(server.tls_ctx_config.ca_cert_dir); + server.tls_ctx_config.ca_cert_dir = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"tls-protocols") && argc >= 2) { + zfree(server.tls_ctx_config.protocols); + server.tls_ctx_config.protocols = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"tls-ciphers") && argc == 2) { + zfree(server.tls_ctx_config.ciphers); + server.tls_ctx_config.ciphers = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"tls-ciphersuites") && argc == 2) { + zfree(server.tls_ctx_config.ciphersuites); + server.tls_ctx_config.ciphersuites = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"tls-prefer-server-ciphers") && argc == 2) { + server.tls_ctx_config.prefer_server_ciphers = yesnotoi(argv[1]); +#endif /* USE_OPENSSL */ } else { err = "Bad directive or wrong number of arguments"; goto loaderr; } @@ -1170,6 +1215,8 @@ void 
configSetCommand(client *c) { "replica-priority",server.slave_priority,0,INT_MAX) { } config_set_numerical_field( "rdb-key-save-delay",server.rdb_key_save_delay,0,LLONG_MAX) { + } config_set_numerical_field( + "key-load-delay",server.key_load_delay,0,LLONG_MAX) { } config_set_numerical_field( "slave-announce-port",server.slave_announce_port,0,65535) { } config_set_numerical_field( @@ -1239,7 +1286,100 @@ void configSetCommand(client *c) { "appendfsync",server.aof_fsync,aof_fsync_enum) { } config_set_enum_field( "repl-diskless-load",server.repl_diskless_load,repl_diskless_load_enum) { - +#ifdef USE_OPENSSL + /* TLS fields. */ + } config_set_special_field("tls-cert-file") { + redisTLSContextConfig tmpctx = server.tls_ctx_config; + tmpctx.cert_file = (char *) o->ptr; + if (tlsConfigure(&tmpctx) == C_ERR) { + addReplyError(c, + "Unable to configure tls-cert-file. Check server logs."); + return; + } + zfree(server.tls_ctx_config.cert_file); + server.tls_ctx_config.cert_file = zstrdup(o->ptr); + } config_set_special_field("tls-key-file") { + redisTLSContextConfig tmpctx = server.tls_ctx_config; + tmpctx.key_file = (char *) o->ptr; + if (tlsConfigure(&tmpctx) == C_ERR) { + addReplyError(c, + "Unable to configure tls-key-file. Check server logs."); + return; + } + zfree(server.tls_ctx_config.key_file); + server.tls_ctx_config.key_file = zstrdup(o->ptr); + } config_set_special_field("tls-dh-params-file") { + redisTLSContextConfig tmpctx = server.tls_ctx_config; + tmpctx.dh_params_file = (char *) o->ptr; + if (tlsConfigure(&tmpctx) == C_ERR) { + addReplyError(c, + "Unable to configure tls-dh-params-file. 
Check server logs."); + return; + } + zfree(server.tls_ctx_config.dh_params_file); + server.tls_ctx_config.dh_params_file = zstrdup(o->ptr); + } config_set_special_field("tls-ca-cert-file") { + redisTLSContextConfig tmpctx = server.tls_ctx_config; + tmpctx.ca_cert_file = (char *) o->ptr; + if (tlsConfigure(&tmpctx) == C_ERR) { + addReplyError(c, + "Unable to configure tls-ca-cert-file. Check server logs."); + return; + } + zfree(server.tls_ctx_config.ca_cert_file); + server.tls_ctx_config.ca_cert_file = zstrdup(o->ptr); + } config_set_special_field("tls-ca-cert-dir") { + redisTLSContextConfig tmpctx = server.tls_ctx_config; + tmpctx.ca_cert_dir = (char *) o->ptr; + if (tlsConfigure(&tmpctx) == C_ERR) { + addReplyError(c, + "Unable to configure tls-ca-cert-dir. Check server logs."); + return; + } + zfree(server.tls_ctx_config.ca_cert_dir); + server.tls_ctx_config.ca_cert_dir = zstrdup(o->ptr); + } config_set_bool_field("tls-auth-clients", server.tls_auth_clients) { + } config_set_bool_field("tls-replication", server.tls_replication) { + } config_set_bool_field("tls-cluster", server.tls_cluster) { + } config_set_special_field("tls-protocols") { + redisTLSContextConfig tmpctx = server.tls_ctx_config; + tmpctx.protocols = (char *) o->ptr; + if (tlsConfigure(&tmpctx) == C_ERR) { + addReplyError(c, + "Unable to configure tls-protocols. Check server logs."); + return; + } + zfree(server.tls_ctx_config.protocols); + server.tls_ctx_config.protocols = zstrdup(o->ptr); + } config_set_special_field("tls-ciphers") { + redisTLSContextConfig tmpctx = server.tls_ctx_config; + tmpctx.ciphers = (char *) o->ptr; + if (tlsConfigure(&tmpctx) == C_ERR) { + addReplyError(c, + "Unable to configure tls-ciphers. 
Check server logs."); + return; + } + zfree(server.tls_ctx_config.ciphers); + server.tls_ctx_config.ciphers = zstrdup(o->ptr); + } config_set_special_field("tls-ciphersuites") { + redisTLSContextConfig tmpctx = server.tls_ctx_config; + tmpctx.ciphersuites = (char *) o->ptr; + if (tlsConfigure(&tmpctx) == C_ERR) { + addReplyError(c, + "Unable to configure tls-ciphersuites. Check server logs."); + return; + } + zfree(server.tls_ctx_config.ciphersuites); + server.tls_ctx_config.ciphersuites = zstrdup(o->ptr); + } config_set_special_field("tls-prefer-server-ciphers") { + redisTLSContextConfig tmpctx = server.tls_ctx_config; + tmpctx.prefer_server_ciphers = yesnotoi(o->ptr); + if (tlsConfigure(&tmpctx) == C_ERR) { + addReplyError(c, "Unable to reconfigure TLS. Check server logs."); + return; + } + server.tls_ctx_config.prefer_server_ciphers = tmpctx.prefer_server_ciphers; +#endif /* USE_OPENSSL */ /* Everyhing else is an error... */ } config_set_else { addReplyErrorFormat(c,"Unsupported CONFIG parameter: %s", @@ -1313,6 +1453,16 @@ void configGetCommand(client *c) { config_get_string_field("pidfile",server.pidfile); config_get_string_field("slave-announce-ip",server.slave_announce_ip); config_get_string_field("replica-announce-ip",server.slave_announce_ip); +#ifdef USE_OPENSSL + config_get_string_field("tls-cert-file",server.tls_ctx_config.cert_file); + config_get_string_field("tls-key-file",server.tls_ctx_config.key_file); + config_get_string_field("tls-dh-params-file",server.tls_ctx_config.dh_params_file); + config_get_string_field("tls-ca-cert-file",server.tls_ctx_config.ca_cert_file); + config_get_string_field("tls-ca-cert-dir",server.tls_ctx_config.ca_cert_dir); + config_get_string_field("tls-protocols",server.tls_ctx_config.protocols); + config_get_string_field("tls-ciphers",server.tls_ctx_config.ciphers); + config_get_string_field("tls-ciphersuites",server.tls_ctx_config.ciphersuites); +#endif /* Numerical values */ 
config_get_numerical_field("maxmemory",server.maxmemory); @@ -1360,6 +1510,7 @@ void configGetCommand(client *c) { config_get_numerical_field("slowlog-max-len", server.slowlog_max_len); config_get_numerical_field("tracking-table-max-fill", server.tracking_table_max_fill); config_get_numerical_field("port",server.port); + config_get_numerical_field("tls-port",server.tls_port); config_get_numerical_field("cluster-announce-port",server.cluster_announce_port); config_get_numerical_field("cluster-announce-bus-port",server.cluster_announce_bus_port); config_get_numerical_field("tcp-backlog",server.tcp_backlog); @@ -1387,6 +1538,7 @@ void configGetCommand(client *c) { config_get_numerical_field("cluster-replica-validity-factor",server.cluster_slave_validity_factor); config_get_numerical_field("repl-diskless-sync-delay",server.repl_diskless_sync_delay); config_get_numerical_field("rdb-key-save-delay",server.rdb_key_save_delay); + config_get_numerical_field("key-load-delay",server.key_load_delay); config_get_numerical_field("tcp-keepalive",server.tcpkeepalive); /* Bool (yes/no) values */ @@ -1399,7 +1551,11 @@ void configGetCommand(client *c) { } config_get_bool_field("activedefrag", server.active_defrag_enabled); - + config_get_bool_field("tls-cluster",server.tls_cluster); + config_get_bool_field("tls-replication",server.tls_replication); + config_get_bool_field("tls-auth-clients",server.tls_auth_clients); + config_get_bool_field("tls-prefer-server-ciphers", + server.tls_ctx_config.prefer_server_ciphers); /* Enum values */ config_get_enum_field("maxmemory-policy", server.maxmemory_policy,maxmemory_policy_enum); @@ -1513,6 +1669,7 @@ void configGetCommand(client *c) { } matches++; } + setDeferredMapLen(c,replylen,matches); } @@ -2119,7 +2276,7 @@ int rewriteConfig(char *path) { } rewriteConfigStringOption(state,"pidfile",server.pidfile,CONFIG_DEFAULT_PID_FILE); - rewriteConfigNumericalOption(state,"port",server.port,CONFIG_DEFAULT_SERVER_PORT); + 
rewriteConfigNumericalOption(state,"tls-port",server.tls_port,CONFIG_DEFAULT_SERVER_TLS_PORT); rewriteConfigNumericalOption(state,"cluster-announce-port",server.cluster_announce_port,CONFIG_DEFAULT_CLUSTER_ANNOUNCE_PORT); rewriteConfigNumericalOption(state,"cluster-announce-bus-port",server.cluster_announce_bus_port,CONFIG_DEFAULT_CLUSTER_ANNOUNCE_BUS_PORT); rewriteConfigNumericalOption(state,"tcp-backlog",server.tcp_backlog,CONFIG_DEFAULT_TCP_BACKLOG); @@ -2201,6 +2358,21 @@ int rewriteConfig(char *path) { rewriteConfigNumericalOption(state,"hz",server.config_hz,CONFIG_DEFAULT_HZ); rewriteConfigEnumOption(state,"supervised",server.supervised_mode,supervised_mode_enum,SUPERVISED_NONE); rewriteConfigNumericalOption(state,"rdb-key-save-delay",server.rdb_key_save_delay,CONFIG_DEFAULT_RDB_KEY_SAVE_DELAY); + rewriteConfigNumericalOption(state,"key-load-delay",server.key_load_delay,CONFIG_DEFAULT_KEY_LOAD_DELAY); +#ifdef USE_OPENSSL + rewriteConfigYesNoOption(state,"tls-cluster",server.tls_cluster,0); + rewriteConfigYesNoOption(state,"tls-replication",server.tls_replication,0); + rewriteConfigYesNoOption(state,"tls-auth-clients",server.tls_auth_clients,1); + rewriteConfigStringOption(state,"tls-cert-file",server.tls_ctx_config.cert_file,NULL); + rewriteConfigStringOption(state,"tls-key-file",server.tls_ctx_config.key_file,NULL); + rewriteConfigStringOption(state,"tls-dh-params-file",server.tls_ctx_config.dh_params_file,NULL); + rewriteConfigStringOption(state,"tls-ca-cert-file",server.tls_ctx_config.ca_cert_file,NULL); + rewriteConfigStringOption(state,"tls-ca-cert-dir",server.tls_ctx_config.ca_cert_dir,NULL); + rewriteConfigStringOption(state,"tls-protocols",server.tls_ctx_config.protocols,NULL); + rewriteConfigStringOption(state,"tls-ciphers",server.tls_ctx_config.ciphers,NULL); + rewriteConfigStringOption(state,"tls-ciphersuites",server.tls_ctx_config.ciphersuites,NULL); + 
rewriteConfigYesNoOption(state,"tls-prefer-server-ciphers",server.tls_ctx_config.prefer_server_ciphers,0); +#endif /* Rewrite Sentinel config if in Sentinel mode. */ if (server.sentinel_mode) rewriteConfigSentinelOption(state); diff --git a/src/connection.c b/src/connection.c new file mode 100644 index 000000000..58d86c31b --- /dev/null +++ b/src/connection.c @@ -0,0 +1,407 @@ +/* + * Copyright (c) 2019, Redis Labs + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "server.h" +#include "connhelpers.h" + +/* The connections module provides a lean abstraction of network connections + * to avoid direct socket and async event management across the Redis code base. + * + * It does NOT provide advanced connection features commonly found in similar + * libraries such as complete in/out buffer management, throttling, etc. These + * functions remain in networking.c. + * + * The primary goal is to allow transparent handling of TCP and TLS based + * connections. To do so, connections have the following properties: + * + * 1. A connection may live before its corresponding socket exists. This + * allows various context and configuration setting to be handled before + * establishing the actual connection. + * 2. The caller may register/unregister logical read/write handlers to be + * called when the connection has data to read from/can accept writes. + * These logical handlers may or may not correspond to actual AE events, + * depending on the implementation (for TCP they are; for TLS they aren't). + */ + +ConnectionType CT_Socket; + +/* When a connection is created we must know its type already, but the + * underlying socket may or may not exist: + * + * - For accepted connections, it exists as we do not model the listen/accept + * part; So caller calls connCreateSocket() followed by connAccept(). + * - For outgoing connections, the socket is created by the connection module + * itself; So caller calls connCreateSocket() followed by connConnect(), + * which registers a connect callback that fires on connected/error state + * (and after any transport level handshake was done). + * + * NOTE: An earlier version relied on connections being part of other structs + * and not independently allocated. This could lead to further optimizations + * like using container_of(), etc. However it was discontinued in favor of + * this approach for these reasons: + * + * 1. 
In some cases conns are created/handled outside the context of the + * containing struct, in which case it gets a bit awkward to copy them. + * 2. Future implementations may wish to allocate arbitrary data for the + * connection. + * 3. The container_of() approach is anyway risky because connections may + * be embedded in different structs, not just client. + */ + +connection *connCreateSocket() { + connection *conn = zcalloc(sizeof(connection)); + conn->type = &CT_Socket; + conn->fd = -1; + + return conn; +} + +/* Create a new socket-type connection that is already associated with + * an accepted connection. + * + * The socket is not ready for I/O until connAccept() was called and + * invoked the connection-level accept handler. + */ +connection *connCreateAcceptedSocket(int fd) { + connection *conn = connCreateSocket(); + conn->fd = fd; + conn->state = CONN_STATE_ACCEPTING; + return conn; +} + +static int connSocketConnect(connection *conn, const char *addr, int port, const char *src_addr, + ConnectionCallbackFunc connect_handler) { + int fd = anetTcpNonBlockBestEffortBindConnect(NULL,addr,port,src_addr); + if (fd == -1) { + conn->state = CONN_STATE_ERROR; + conn->last_errno = errno; + return C_ERR; + } + + conn->fd = fd; + conn->state = CONN_STATE_CONNECTING; + + conn->conn_handler = connect_handler; + aeCreateFileEvent(server.el, conn->fd, AE_WRITABLE, + conn->type->ae_handler, conn); + + return C_OK; +} + +/* Returns true if a write handler is registered */ +int connHasWriteHandler(connection *conn) { + return conn->write_handler != NULL; +} + +/* Returns true if a read handler is registered */ +int connHasReadHandler(connection *conn) { + return conn->read_handler != NULL; +} + +/* Associate a private data pointer with the connection */ +void connSetPrivateData(connection *conn, void *data) { + conn->private_data = data; +} + +/* Get the associated private data pointer */ +void *connGetPrivateData(connection *conn) { + return conn->private_data; +} + +/*
------ Pure socket connections ------- */ + +/* A very incomplete list of implementation-specific calls. Much of the above shall + * move here as we implement additional connection types. + */ + +/* Close the connection and free resources. */ +static void connSocketClose(connection *conn) { + if (conn->fd != -1) { + aeDeleteFileEvent(server.el,conn->fd,AE_READABLE); + aeDeleteFileEvent(server.el,conn->fd,AE_WRITABLE); + close(conn->fd); + conn->fd = -1; + } + + /* If called from within a handler, schedule the close but + * keep the connection until the handler returns. + */ + if (conn->flags & CONN_FLAG_IN_HANDLER) { + conn->flags |= CONN_FLAG_CLOSE_SCHEDULED; + return; + } + + zfree(conn); +} + +static int connSocketWrite(connection *conn, const void *data, size_t data_len) { + int ret = write(conn->fd, data, data_len); + if (ret < 0 && errno != EAGAIN) { + conn->last_errno = errno; + conn->state = CONN_STATE_ERROR; + } + + return ret; +} + +static int connSocketRead(connection *conn, void *buf, size_t buf_len) { + int ret = read(conn->fd, buf, buf_len); + if (!ret) { + conn->state = CONN_STATE_CLOSED; + } else if (ret < 0 && errno != EAGAIN) { + conn->last_errno = errno; + conn->state = CONN_STATE_ERROR; + } + + return ret; +} + +static int connSocketAccept(connection *conn, ConnectionCallbackFunc accept_handler) { + if (conn->state != CONN_STATE_ACCEPTING) return C_ERR; + conn->state = CONN_STATE_CONNECTED; + if (!callHandler(conn, accept_handler)) return C_ERR; + return C_OK; +} + +/* Register a write handler, to be called when the connection is writable. + * If NULL, the existing handler is removed. + * + * The barrier flag indicates a write barrier is requested, resulting with + * CONN_FLAG_WRITE_BARRIER set. This will ensure that the write handler is + * always called before and not after the read handler in a single event + * loop. 
+ */ +static int connSocketSetWriteHandler(connection *conn, ConnectionCallbackFunc func, int barrier) { + if (func == conn->write_handler) return C_OK; + + conn->write_handler = func; + if (barrier) + conn->flags |= CONN_FLAG_WRITE_BARRIER; + else + conn->flags &= ~CONN_FLAG_WRITE_BARRIER; + if (!conn->write_handler) + aeDeleteFileEvent(server.el,conn->fd,AE_WRITABLE); + else + if (aeCreateFileEvent(server.el,conn->fd,AE_WRITABLE, + conn->type->ae_handler,conn) == AE_ERR) return C_ERR; + return C_OK; +} + +/* Register a read handler, to be called when the connection is readable. + * If NULL, the existing handler is removed. + */ +static int connSocketSetReadHandler(connection *conn, ConnectionCallbackFunc func) { + if (func == conn->read_handler) return C_OK; + + conn->read_handler = func; + if (!conn->read_handler) + aeDeleteFileEvent(server.el,conn->fd,AE_READABLE); + else + if (aeCreateFileEvent(server.el,conn->fd, + AE_READABLE,conn->type->ae_handler,conn) == AE_ERR) return C_ERR; + return C_OK; +} + +static const char *connSocketGetLastError(connection *conn) { + return strerror(conn->last_errno); +} + +static void connSocketEventHandler(struct aeEventLoop *el, int fd, void *clientData, int mask) +{ + UNUSED(el); + UNUSED(fd); + connection *conn = clientData; + + if (conn->state == CONN_STATE_CONNECTING && + (mask & AE_WRITABLE) && conn->conn_handler) { + int conn_error = connGetSocketError(conn); /* the error is the return value; errno is stale when getsockopt() succeeds */ + if (conn_error) { + conn->last_errno = conn_error; + conn->state = CONN_STATE_ERROR; + } else { + conn->state = CONN_STATE_CONNECTED; + } + + if (!conn->write_handler) aeDeleteFileEvent(server.el,conn->fd,AE_WRITABLE); + + if (!callHandler(conn, conn->conn_handler)) return; + conn->conn_handler = NULL; + } + + /* Normally we execute the readable event first, and the writable + * event later. This is useful as sometimes we may be able + * to serve the reply of a query immediately after processing the + * query.
+ * + * However if WRITE_BARRIER is set in the mask, our application is + * asking us to do the reverse: never fire the writable event + * after the readable. In such a case, we invert the calls. + * This is useful when, for instance, we want to do things + * in the beforeSleep() hook, like fsync'ing a file to disk, + * before replying to a client. */ + int invert = conn->flags & CONN_FLAG_WRITE_BARRIER; + + int call_write = (mask & AE_WRITABLE) && conn->write_handler; + int call_read = (mask & AE_READABLE) && conn->read_handler; + + /* Handle normal I/O flows */ + if (!invert && call_read) { + if (!callHandler(conn, conn->read_handler)) return; + } + /* Fire the writable event. */ + if (call_write) { + if (!callHandler(conn, conn->write_handler)) return; + } + /* If we have to invert the call, fire the readable event now + * after the writable one. */ + if (invert && call_read) { + if (!callHandler(conn, conn->read_handler)) return; + } +} + +static int connSocketBlockingConnect(connection *conn, const char *addr, int port, long long timeout) { + int fd = anetTcpNonBlockConnect(NULL,addr,port); + if (fd == -1) { + conn->state = CONN_STATE_ERROR; + conn->last_errno = errno; + return C_ERR; + } + + conn->fd = fd; /* store the fd before waiting so closing the connection also closes the socket on timeout */ + if ((aeWait(fd, AE_WRITABLE, timeout) & AE_WRITABLE) == 0) { + conn->state = CONN_STATE_ERROR; + conn->last_errno = ETIMEDOUT; + return C_ERR; /* timed out: must not fall through and report CONNECTED / C_OK */ + } + conn->state = CONN_STATE_CONNECTED; + return C_OK; +} + +/* Connection-based versions of syncio.c functions. + * NOTE: This should ideally be refactored out in favor of pure async work.
+ */ + +static ssize_t connSocketSyncWrite(connection *conn, char *ptr, ssize_t size, long long timeout) { + return syncWrite(conn->fd, ptr, size, timeout); +} + +static ssize_t connSocketSyncRead(connection *conn, char *ptr, ssize_t size, long long timeout) { + return syncRead(conn->fd, ptr, size, timeout); +} + +static ssize_t connSocketSyncReadLine(connection *conn, char *ptr, ssize_t size, long long timeout) { + return syncReadLine(conn->fd, ptr, size, timeout); +} + + +ConnectionType CT_Socket = { + .ae_handler = connSocketEventHandler, + .close = connSocketClose, + .write = connSocketWrite, + .read = connSocketRead, + .accept = connSocketAccept, + .connect = connSocketConnect, + .set_write_handler = connSocketSetWriteHandler, + .set_read_handler = connSocketSetReadHandler, + .get_last_error = connSocketGetLastError, + .blocking_connect = connSocketBlockingConnect, + .sync_write = connSocketSyncWrite, + .sync_read = connSocketSyncRead, + .sync_readline = connSocketSyncReadLine +}; + + +int connGetSocketError(connection *conn) { + int sockerr = 0; + socklen_t errlen = sizeof(sockerr); + + if (getsockopt(conn->fd, SOL_SOCKET, SO_ERROR, &sockerr, &errlen) == -1) + sockerr = errno; + return sockerr; +} + +int connPeerToString(connection *conn, char *ip, size_t ip_len, int *port) { + return anetPeerToString(conn ? conn->fd : -1, ip, ip_len, port); +} + +int connFormatPeer(connection *conn, char *buf, size_t buf_len) { + return anetFormatPeer(conn ? 
conn->fd : -1, buf, buf_len); +} + +int connSockName(connection *conn, char *ip, size_t ip_len, int *port) { + return anetSockName(conn->fd, ip, ip_len, port); +} + +int connBlock(connection *conn) { + if (conn->fd == -1) return C_ERR; + return anetBlock(NULL, conn->fd); +} + +int connNonBlock(connection *conn) { + if (conn->fd == -1) return C_ERR; + return anetNonBlock(NULL, conn->fd); +} + +int connEnableTcpNoDelay(connection *conn) { + if (conn->fd == -1) return C_ERR; + return anetEnableTcpNoDelay(NULL, conn->fd); +} + +int connDisableTcpNoDelay(connection *conn) { + if (conn->fd == -1) return C_ERR; + return anetDisableTcpNoDelay(NULL, conn->fd); +} + +int connKeepAlive(connection *conn, int interval) { + if (conn->fd == -1) return C_ERR; + return anetKeepAlive(NULL, conn->fd, interval); +} + +int connSendTimeout(connection *conn, long long ms) { + return anetSendTimeout(NULL, conn->fd, ms); +} + +int connRecvTimeout(connection *conn, long long ms) { + return anetRecvTimeout(NULL, conn->fd, ms); +} + +int connGetState(connection *conn) { + return conn->state; +} + +/* Return a text that describes the connection, suitable for inclusion + * in CLIENT LIST and similar outputs. + * + * For sockets, we always return "fd=" to maintain compatibility. + */ +const char *connGetInfo(connection *conn, char *buf, size_t buf_len) { + snprintf(buf, buf_len, "fd=%i", conn->fd); /* snprintf() already reserves the NUL byte; buf_len-1 would wrap around when buf_len is 0 */ + return buf; +} + diff --git a/src/connection.h b/src/connection.h new file mode 100644 index 000000000..97622f8d6 --- /dev/null +++ b/src/connection.h @@ -0,0 +1,220 @@ + +/* + * Copyright (c) 2019, Redis Labs + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __REDIS_CONNECTION_H +#define __REDIS_CONNECTION_H + +#define CONN_INFO_LEN 32 + +struct aeEventLoop; +typedef struct connection connection; + +typedef enum { + CONN_STATE_NONE = 0, + CONN_STATE_CONNECTING, + CONN_STATE_ACCEPTING, + CONN_STATE_CONNECTED, + CONN_STATE_CLOSED, + CONN_STATE_ERROR +} ConnectionState; + +#define CONN_FLAG_IN_HANDLER (1<<0) /* A handler execution is in progress */ +#define CONN_FLAG_CLOSE_SCHEDULED (1<<1) /* Closed scheduled by a handler */ +#define CONN_FLAG_WRITE_BARRIER (1<<2) /* Write barrier requested */ + +typedef void (*ConnectionCallbackFunc)(struct connection *conn); + +typedef struct ConnectionType { + void (*ae_handler)(struct aeEventLoop *el, int fd, void *clientData, int mask); + int (*connect)(struct connection *conn, const char *addr, int port, const char *source_addr, ConnectionCallbackFunc connect_handler); + int (*write)(struct connection *conn, const void *data, size_t data_len); + int (*read)(struct connection *conn, void *buf, size_t buf_len); + void (*close)(struct connection *conn); + int (*accept)(struct connection *conn, ConnectionCallbackFunc accept_handler); + int (*set_write_handler)(struct connection *conn, ConnectionCallbackFunc handler, int barrier); + int (*set_read_handler)(struct connection *conn, ConnectionCallbackFunc handler); + const char *(*get_last_error)(struct connection *conn); + int (*blocking_connect)(struct connection *conn, const char *addr, int port, long long timeout); + ssize_t (*sync_write)(struct connection *conn, char *ptr, ssize_t size, long long timeout); + ssize_t (*sync_read)(struct connection *conn, char *ptr, ssize_t size, long long timeout); + ssize_t (*sync_readline)(struct connection *conn, char *ptr, ssize_t size, long long timeout); +} ConnectionType; + +struct connection { + ConnectionType *type; + ConnectionState state; + int flags; + int last_errno; + void *private_data; + ConnectionCallbackFunc conn_handler; + ConnectionCallbackFunc write_handler; + 
ConnectionCallbackFunc read_handler; + int fd; +}; + +/* The connection module does not deal with listening and accepting sockets, + * so we assume we have a socket when an incoming connection is created. + * + * The fd supplied should therefore be associated with an already accept()ed + * socket. + * + * connAccept() may directly call accept_handler(), or return and call it + * at a later time. This behavior is a bit awkward but aims to reduce the need + * to wait for the next event loop, if no additional handshake is required. + */ + +static inline int connAccept(connection *conn, ConnectionCallbackFunc accept_handler) { + return conn->type->accept(conn, accept_handler); +} + +/* Establish a connection. The connect_handler will be called when the connection + * is established, or if an error has occurred. + * + * The connection handler will be responsible to set up any read/write handlers + * as needed. + * + * If C_ERR is returned, the operation failed and the connection handler shall + * not be expected. + */ +static inline int connConnect(connection *conn, const char *addr, int port, const char *src_addr, + ConnectionCallbackFunc connect_handler) { + return conn->type->connect(conn, addr, port, src_addr, connect_handler); +} + +/* Blocking connect. + * + * NOTE: This is implemented in order to simplify the transition to the abstract + * connections, but should probably be refactored out of cluster.c and replication.c, + * in favor of a pure async implementation. + */ +static inline int connBlockingConnect(connection *conn, const char *addr, int port, long long timeout) { + return conn->type->blocking_connect(conn, addr, port, timeout); +} + +/* Write to connection, behaves the same as write(2). + * + * Like write(2), a short write is possible. A -1 return indicates an error. + * + * The caller should NOT rely on errno. To test for an EAGAIN-like condition, use + * connGetState() to see if the connection state is still CONN_STATE_CONNECTED.
+ */ +static inline int connWrite(connection *conn, const void *data, size_t data_len) { + return conn->type->write(conn, data, data_len); +} + +/* Read from the connection, behaves the same as read(2). + * + * Like read(2), a short read is possible. A return value of 0 will indicate the + * connection was closed, and -1 will indicate an error. + * + * The caller should NOT rely on errno. To test for an EAGAIN-like condition, use + * connGetState() to see if the connection state is still CONN_STATE_CONNECTED. + */ +static inline int connRead(connection *conn, void *buf, size_t buf_len) { + return conn->type->read(conn, buf, buf_len); +} + +/* Register a write handler, to be called when the connection is writable. + * If NULL, the existing handler is removed. + */ +static inline int connSetWriteHandler(connection *conn, ConnectionCallbackFunc func) { + return conn->type->set_write_handler(conn, func, 0); +} + +/* Register a read handler, to be called when the connection is readable. + * If NULL, the existing handler is removed. + */ +static inline int connSetReadHandler(connection *conn, ConnectionCallbackFunc func) { + return conn->type->set_read_handler(conn, func); +} + +/* Set a write handler, and possibly enable a write barrier, this flag is + * cleared when write handler is changed or removed. + * With barrier enabled, we never fire the event if the read handler already + * fired in the same event loop iteration. Useful when you want to persist + * things to disk before sending replies, and want to do that in a group fashion. */ +static inline int connSetWriteHandlerWithBarrier(connection *conn, ConnectionCallbackFunc func, int barrier) { + return conn->type->set_write_handler(conn, func, barrier); +} + +static inline void connClose(connection *conn) { + conn->type->close(conn); +} + +/* Returns the last error encountered by the connection, as a string. If no error, + * a NULL is returned.
+ */ +static inline const char *connGetLastError(connection *conn) { + return conn->type->get_last_error(conn); +} + +static inline ssize_t connSyncWrite(connection *conn, char *ptr, ssize_t size, long long timeout) { + return conn->type->sync_write(conn, ptr, size, timeout); +} + +static inline ssize_t connSyncRead(connection *conn, char *ptr, ssize_t size, long long timeout) { + return conn->type->sync_read(conn, ptr, size, timeout); +} + +static inline ssize_t connSyncReadLine(connection *conn, char *ptr, ssize_t size, long long timeout) { + return conn->type->sync_readline(conn, ptr, size, timeout); +} + +connection *connCreateSocket(); +connection *connCreateAcceptedSocket(int fd); + +connection *connCreateTLS(); +connection *connCreateAcceptedTLS(int fd, int require_auth); + +void connSetPrivateData(connection *conn, void *data); +void *connGetPrivateData(connection *conn); +int connGetState(connection *conn); +int connHasWriteHandler(connection *conn); +int connHasReadHandler(connection *conn); +int connGetSocketError(connection *conn); + +/* anet-style wrappers to conns */ +int connBlock(connection *conn); +int connNonBlock(connection *conn); +int connEnableTcpNoDelay(connection *conn); +int connDisableTcpNoDelay(connection *conn); +int connKeepAlive(connection *conn, int interval); +int connSendTimeout(connection *conn, long long ms); +int connRecvTimeout(connection *conn, long long ms); +int connPeerToString(connection *conn, char *ip, size_t ip_len, int *port); +int connFormatPeer(connection *conn, char *buf, size_t buf_len); +int connSockName(connection *conn, char *ip, size_t ip_len, int *port); +const char *connGetInfo(connection *conn, char *buf, size_t buf_len); + +/* Helpers for tls special considerations */ +int tlsHasPendingData(); +void tlsProcessPendingData(); + +#endif /* __REDIS_CONNECTION_H */ diff --git a/src/connhelpers.h b/src/connhelpers.h new file mode 100644 index 000000000..f237c9b1d --- /dev/null +++ b/src/connhelpers.h @@ -0,0 +1,85 
@@ + +/* + * Copyright (c) 2019, Redis Labs + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __REDIS_CONNHELPERS_H +#define __REDIS_CONNHELPERS_H + +#include "connection.h" + +/* These are helper functions that are common to different connection + * implementations (currently sockets in connection.c and TLS in tls.c). + * + * Currently helpers implement the mechanisms for invoking connection + * handlers, tracking in-handler states and dealing with deferred + * destruction (if invoked by a handler). 
+ */ + +/* Called whenever a handler is invoked on a connection and sets the + * CONN_FLAG_IN_HANDLER flag to indicate we're in a handler context. + * + * An attempt to close a connection while CONN_FLAG_IN_HANDLER is + * set will result with deferred close, i.e. setting the CONN_FLAG_CLOSE_SCHEDULED + * instead of destructing it. + */ +static inline void enterHandler(connection *conn) { + conn->flags |= CONN_FLAG_IN_HANDLER; +} + +/* Called whenever a handler returns. This unsets the CONN_FLAG_IN_HANDLER + * flag and performs actual close/destruction if a deferred close was + * scheduled by the handler. + */ +static inline int exitHandler(connection *conn) { + conn->flags &= ~CONN_FLAG_IN_HANDLER; + if (conn->flags & CONN_FLAG_CLOSE_SCHEDULED) { + connClose(conn); + return 0; + } + return 1; +} + +/* Helper for connection implementations to call handlers: + * 1. Mark the handler in use. + * 2. Execute the handler (if set). + * 3. Mark the handler as NOT in use and perform deferred close if was + * requested by the handler at any time. + */ +static inline int callHandler(connection *conn, ConnectionCallbackFunc handler) { + conn->flags |= CONN_FLAG_IN_HANDLER; + if (handler) handler(conn); + conn->flags &= ~CONN_FLAG_IN_HANDLER; + if (conn->flags & CONN_FLAG_CLOSE_SCHEDULED) { + connClose(conn); + return 0; + } + return 1; +} + +#endif /* __REDIS_CONNHELPERS_H */ diff --git a/src/debug.c b/src/debug.c index 29a244e24..179f6d2c9 100644 --- a/src/debug.c +++ b/src/debug.c @@ -369,6 +369,7 @@ void debugCommand(client *c) { "SDSLEN -- Show low level SDS string info representing key and value.", "SEGFAULT -- Crash the server with sigsegv.", "SET-ACTIVE-EXPIRE <0|1> -- Setting it to 0 disables expiring keys in background when they are not accessed (otherwise the Redis behavior). Setting it to 1 reenables back the default.", +"AOF-FLUSH-SLEEP -- Server will sleep before flushing the AOF, this is used for testing", "SLEEP -- Stop the server for . 
Decimals allowed.", "STRUCTSIZE -- Return the size of different Redis core C structures.", "ZIPLIST -- Show low level info about the ziplist encoding.", @@ -649,6 +650,11 @@ NULL { server.active_expire_enabled = atoi(c->argv[2]->ptr); addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"aof-flush-sleep") && + c->argc == 3) + { + server.aof_flush_sleep = atoi(c->argv[2]->ptr); + addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"lua-always-replicate-commands") && c->argc == 3) { @@ -762,11 +768,12 @@ void _serverAssert(const char *estr, const char *file, int line) { void _serverAssertPrintClientInfo(const client *c) { int j; + char conninfo[CONN_INFO_LEN]; bugReportStart(); serverLog(LL_WARNING,"=== ASSERTION FAILED CLIENT CONTEXT ==="); - serverLog(LL_WARNING,"client->flags = %llu", (unsigned long long)c->flags); - serverLog(LL_WARNING,"client->fd = %d", c->fd); + serverLog(LL_WARNING,"client->flags = %llu", (unsigned long long) c->flags); + serverLog(LL_WARNING,"client->conn = %s", connGetInfo(c->conn, conninfo, sizeof(conninfo))); serverLog(LL_WARNING,"client->argc = %d", c->argc); for (j=0; j < c->argc; j++) { char buf[128]; diff --git a/src/module.c b/src/module.c index 7e0a419b6..0d83817fa 100644 --- a/src/module.c +++ b/src/module.c @@ -2872,7 +2872,7 @@ RedisModuleCallReply *RM_Call(RedisModuleCtx *ctx, const char *cmdname, const ch /* Create the client and dispatch the command. */ va_start(ap, fmt); - c = createClient(-1); + c = createClient(NULL); c->user = NULL; /* Root user. 
*/ argv = moduleCreateArgvFromUserFormat(cmdname,fmt,&argc,&flags,ap); replicate = flags & REDISMODULE_ARGV_REPLICATE; @@ -3836,7 +3836,7 @@ RedisModuleBlockedClient *RM_BlockClient(RedisModuleCtx *ctx, RedisModuleCmdFunc bc->disconnect_callback = NULL; /* Set by RM_SetDisconnectCallback() */ bc->free_privdata = free_privdata; bc->privdata = NULL; - bc->reply_client = createClient(-1); + bc->reply_client = createClient(NULL); bc->reply_client->flags |= CLIENT_MODULE; bc->dbid = c->db->id; c->bpop.timeout = timeout_ms ? (mstime()+timeout_ms) : 0; @@ -4077,7 +4077,7 @@ RedisModuleCtx *RM_GetThreadSafeContext(RedisModuleBlockedClient *bc) { * access it safely from another thread, so we create a fake client here * in order to keep things like the currently selected database and similar * things. */ - ctx->client = createClient(-1); + ctx->client = createClient(NULL); if (bc) { selectDb(ctx->client,bc->dbid); ctx->client->id = bc->client->id; @@ -5552,7 +5552,7 @@ void moduleInitModulesSystem(void) { /* Set up the keyspace notification susbscriber list and static client */ moduleKeyspaceSubscribers = listCreate(); - moduleFreeContextReusedClient = createClient(-1); + moduleFreeContextReusedClient = createClient(NULL); moduleFreeContextReusedClient->flags |= CLIENT_MODULE; moduleFreeContextReusedClient->user = NULL; /* root user. */ diff --git a/src/networking.c b/src/networking.c index a959d557a..ddfe4d8e3 100644 --- a/src/networking.c +++ b/src/networking.c @@ -84,32 +84,27 @@ void linkClient(client *c) { raxInsert(server.clients_index,(unsigned char*)&id,sizeof(id),c,NULL); } -client *createClient(int fd) { +client *createClient(connection *conn) { client *c = zmalloc(sizeof(client)); - /* passing -1 as fd it is possible to create a non connected client. + /* passing NULL as conn it is possible to create a non connected client. * This is useful since all the commands needs to be executed * in the context of a client. 
When commands are executed in other * contexts (for instance a Lua script) we need a non connected client. */ - if (fd != -1) { - anetNonBlock(NULL,fd); - anetEnableTcpNoDelay(NULL,fd); + if (conn) { + connNonBlock(conn); + connEnableTcpNoDelay(conn); if (server.tcpkeepalive) - anetKeepAlive(NULL,fd,server.tcpkeepalive); - if (aeCreateFileEvent(server.el,fd,AE_READABLE, - readQueryFromClient, c) == AE_ERR) - { - close(fd); - zfree(c); - return NULL; - } + connKeepAlive(conn,server.tcpkeepalive); + connSetReadHandler(conn, readQueryFromClient); + connSetPrivateData(conn, c); } selectDb(c,0); uint64_t client_id = ++server.next_client_id; c->id = client_id; c->resp = 2; - c->fd = fd; + c->conn = conn; c->name = NULL; c->bufpos = 0; c->qb_pos = 0; @@ -161,7 +156,7 @@ client *createClient(int fd) { c->client_tracking_redirection = 0; listSetFreeMethod(c->pubsub_patterns,decrRefCountVoid); listSetMatchMethod(c->pubsub_patterns,listMatchObjects); - if (fd != -1) linkClient(c); + if (conn) linkClient(c); initClientMultiState(c); return c; } @@ -227,7 +222,7 @@ int prepareClientToWrite(client *c) { if ((c->flags & CLIENT_MASTER) && !(c->flags & CLIENT_MASTER_FORCE_REPLY)) return C_ERR; - if (c->fd <= 0) return C_ERR; /* Fake client for AOF loading. */ + if (!c->conn) return C_ERR; /* Fake client for AOF loading. */ /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ @@ -777,28 +772,13 @@ int clientHasPendingReplies(client *c) { return c->bufpos || listLength(c->reply); } -#define MAX_ACCEPTS_PER_CALL 1000 -static void acceptCommonHandler(int fd, int flags, char *ip) { - client *c; - if ((c = createClient(fd)) == NULL) { - serverLog(LL_WARNING, - "Error registering fd event for the new client: %s (fd=%d)", - strerror(errno),fd); - close(fd); /* May be already closed, just ignore errors */ - return; - } - /* If maxclient directive is set and this is one client more... 
close the - * connection. Note that we create the client instead to check before - * for this condition, since now the socket is already set in non-blocking - * mode and we can send an error for free using the Kernel I/O */ - if (listLength(server.clients) > server.maxclients) { - char *err = "-ERR max number of clients reached\r\n"; +void clientAcceptHandler(connection *conn) { + client *c = connGetPrivateData(conn); - /* That's a best effort error message, don't check write errors */ - if (write(c->fd,err,strlen(err)) == -1) { - /* Nothing to do, Just to avoid the warning... */ - } - server.stat_rejected_conn++; + if (connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_WARNING, + "Error accepting a client connection: %s", + connGetLastError(conn)); freeClient(c); return; } @@ -810,10 +790,12 @@ static void acceptCommonHandler(int fd, int flags, char *ip) { if (server.protected_mode && server.bindaddr_count == 0 && DefaultUser->flags & USER_FLAG_NOPASS && - !(flags & CLIENT_UNIX_SOCKET) && - ip != NULL) + !(c->flags & CLIENT_UNIX_SOCKET)) { - if (strcmp(ip,"127.0.0.1") && strcmp(ip,"::1")) { + char cip[NET_IP_STR_LEN+1] = { 0 }; + connPeerToString(conn, cip, sizeof(cip)-1, NULL); + + if (strcmp(cip,"127.0.0.1") && strcmp(cip,"::1")) { char *err = "-DENIED Redis is running in protected mode because protected " "mode is enabled, no bind address was specified, no " @@ -835,7 +817,7 @@ static void acceptCommonHandler(int fd, int flags, char *ip) { "4) Setup a bind address or an authentication password. " "NOTE: You only need to do one of the above things in order for " "the server to start accepting connections from the outside.\r\n"; - if (write(c->fd,err,strlen(err)) == -1) { + if (connWrite(c->conn,err,strlen(err)) == -1) { /* Nothing to do, Just to avoid the warning... 
*/ } server.stat_rejected_conn++; @@ -845,7 +827,63 @@ static void acceptCommonHandler(int fd, int flags, char *ip) { } server.stat_numconnections++; +} + + +#define MAX_ACCEPTS_PER_CALL 1000 +static void acceptCommonHandler(connection *conn, int flags, char *ip) { + client *c; + UNUSED(ip); + + /* Admission control will happen before a client is created and connAccept() + * called, because we don't want to even start transport-level negotiation + * if rejected. + */ + if (listLength(server.clients) >= server.maxclients) { + char *err = "-ERR max number of clients reached\r\n"; + + /* That's a best effort error message, don't check write errors. + * Note that for TLS connections, no handshake was done yet so nothing is written + * and the connection will just drop. + */ + if (connWrite(conn,err,strlen(err)) == -1) { + /* Nothing to do, Just to avoid the warning... */ + } + server.stat_rejected_conn++; + connClose(conn); + return; + } + + /* Create connection and client */ + if ((c = createClient(conn)) == NULL) { + char conninfo[100]; + serverLog(LL_WARNING, + "Error registering fd event for the new client: %s (conn: %s)", + connGetLastError(conn), + connGetInfo(conn, conninfo, sizeof(conninfo))); + connClose(conn); /* May be already closed, just ignore errors */ + return; + } + + /* Last chance to keep flags */ c->flags |= flags; + + /* Initiate accept. + * + * Note that connAccept() is free to do two things here: + * 1. Call clientAcceptHandler() immediately; + * 2. Schedule a future call to clientAcceptHandler(). + * + * Because of that, we must do nothing else afterwards. 
+ */ + if (connAccept(conn, clientAcceptHandler) == C_ERR) { + char conninfo[100]; + serverLog(LL_WARNING, + "Error accepting a client connection: %s (conn: %s)", + connGetLastError(conn), connGetInfo(conn, conninfo, sizeof(conninfo))); + freeClient(connGetPrivateData(conn)); + return; + } } void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) { @@ -864,7 +902,27 @@ void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) { return; } serverLog(LL_VERBOSE,"Accepted %s:%d", cip, cport); - acceptCommonHandler(cfd,0,cip); + acceptCommonHandler(connCreateAcceptedSocket(cfd),0,cip); + } +} + +void acceptTLSHandler(aeEventLoop *el, int fd, void *privdata, int mask) { + int cport, cfd, max = MAX_ACCEPTS_PER_CALL; + char cip[NET_IP_STR_LEN]; + UNUSED(el); + UNUSED(mask); + UNUSED(privdata); + + while(max--) { + cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); + if (cfd == ANET_ERR) { + if (errno != EWOULDBLOCK) + serverLog(LL_WARNING, + "Accepting client connection: %s", server.neterr); + return; + } + serverLog(LL_VERBOSE,"Accepted %s:%d", cip, cport); + acceptCommonHandler(connCreateAcceptedTLS(cfd, server.tls_auth_clients),0,cip); } } @@ -883,7 +941,7 @@ void acceptUnixHandler(aeEventLoop *el, int fd, void *privdata, int mask) { return; } serverLog(LL_VERBOSE,"Accepted connection to %s", server.unixsocket); - acceptCommonHandler(cfd,CLIENT_UNIX_SOCKET,NULL); + acceptCommonHandler(connCreateAcceptedSocket(cfd),CLIENT_UNIX_SOCKET,NULL); } } @@ -914,10 +972,10 @@ void unlinkClient(client *c) { /* If this is marked as current client unset it. */ if (server.current_client == c) server.current_client = NULL; - /* Certain operations must be done only if the client has an active socket. + /* Certain operations must be done only if the client has an active connection. * If the client was already unlinked or if it's a "fake client" the - * fd is already set to -1. */ - if (c->fd != -1) { + * conn is already set to NULL. 
*/ + if (c->conn) { /* Remove from the list of active clients. */ if (c->client_list_node) { uint64_t id = htonu64(c->id); @@ -926,21 +984,23 @@ void unlinkClient(client *c) { c->client_list_node = NULL; } - /* In the case of diskless replication the fork is writing to the - * sockets and just closing the fd isn't enough, if we don't also - * shutdown the socket the fork will continue to write to the slave - * and the salve will only find out that it was disconnected when - * it will finish reading the rdb. */ - if ((c->flags & CLIENT_SLAVE) && - (c->replstate == SLAVE_STATE_WAIT_BGSAVE_END)) { - shutdown(c->fd, SHUT_RDWR); + /* Check if this is a replica waiting for diskless replication (rdb pipe), + * in which case it needs to be cleaned from that list */ + if (c->flags & CLIENT_SLAVE && + c->replstate == SLAVE_STATE_WAIT_BGSAVE_END && + server.rdb_pipe_conns) + { + int i; + for (i=0; i < server.rdb_pipe_numconns; i++) { + if (server.rdb_pipe_conns[i] == c->conn) { + rdbPipeWriteHandlerConnRemoved(c->conn); + server.rdb_pipe_conns[i] = NULL; + break; + } + } } - - /* Unregister async I/O handlers and close the socket. */ - aeDeleteFileEvent(server.el,c->fd,AE_READABLE); - aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); - close(c->fd); - c->fd = -1; + connClose(c->conn); + c->conn = NULL; } /* Remove from the list of pending writes if needed. */ @@ -1112,19 +1172,20 @@ client *lookupClientByID(uint64_t id) { /* Write data in output buffers to client. Return C_OK if the client * is still valid after the call, C_ERR if it was freed because of some - * error. + * error. If handler_installed is set, it will attempt to clear the + * write event. * * This function is called by threads, but always with handler_installed * set to 0. So when handler_installed is set to 0 the function must be * thread safe. 
*/ -int writeToClient(int fd, client *c, int handler_installed) { +int writeToClient(client *c, int handler_installed) { ssize_t nwritten = 0, totwritten = 0; size_t objlen; clientReplyBlock *o; while(clientHasPendingReplies(c)) { if (c->bufpos > 0) { - nwritten = write(fd,c->buf+c->sentlen,c->bufpos-c->sentlen); + nwritten = connWrite(c->conn,c->buf+c->sentlen,c->bufpos-c->sentlen); if (nwritten <= 0) break; c->sentlen += nwritten; totwritten += nwritten; @@ -1145,7 +1206,7 @@ int writeToClient(int fd, client *c, int handler_installed) { continue; } - nwritten = write(fd, o->buf + c->sentlen, objlen - c->sentlen); + nwritten = connWrite(c->conn, o->buf + c->sentlen, objlen - c->sentlen); if (nwritten <= 0) break; c->sentlen += nwritten; totwritten += nwritten; @@ -1180,11 +1241,11 @@ int writeToClient(int fd, client *c, int handler_installed) { } server.stat_net_output_bytes += totwritten; if (nwritten == -1) { - if (errno == EAGAIN) { + if (connGetState(c->conn) == CONN_STATE_CONNECTED) { nwritten = 0; } else { serverLog(LL_VERBOSE, - "Error writing to client: %s", strerror(errno)); + "Error writing to client: %s", connGetLastError(c->conn)); freeClientAsync(c); return C_ERR; } @@ -1202,7 +1263,7 @@ int writeToClient(int fd, client *c, int handler_installed) { * adDeleteFileEvent() is not thread safe: however writeToClient() * is always called with handler_installed set to 0 from threads * so we are fine. */ - if (handler_installed) aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); + if (handler_installed) connSetWriteHandler(c->conn, NULL); /* Close connection after entire reply has been sent. */ if (c->flags & CLIENT_CLOSE_AFTER_REPLY) { @@ -1214,10 +1275,9 @@ int writeToClient(int fd, client *c, int handler_installed) { } /* Write event handler. Just send data to the client. 
*/ -void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { - UNUSED(el); - UNUSED(mask); - writeToClient(fd,privdata,1); +void sendReplyToClient(connection *conn) { + client *c = connGetPrivateData(conn); + writeToClient(c,1); } /* This function is called just before entering the event loop, in the hope @@ -1240,26 +1300,24 @@ int handleClientsWithPendingWrites(void) { if (c->flags & CLIENT_PROTECTED) continue; /* Try to write buffers to the client socket. */ - if (writeToClient(c->fd,c,0) == C_ERR) continue; + if (writeToClient(c,0) == C_ERR) continue; /* If after the synchronous writes above we still have data to * output to the client, we need to install the writable handler. */ if (clientHasPendingReplies(c)) { - int ae_flags = AE_WRITABLE; + int ae_barrier = 0; /* For the fsync=always policy, we want that a given FD is never * served for reading and writing in the same event loop iteration, * so that in the middle of receiving the query, and serving it * to the client, we'll call beforeSleep() that will do the - * actual fsync of AOF to disk. AE_BARRIER ensures that. */ + * actual fsync of AOF to disk. the write barrier ensures that. */ if (server.aof_state == AOF_ON && server.aof_fsync == AOF_FSYNC_ALWAYS) { - ae_flags |= AE_BARRIER; + ae_barrier = 1; } - if (aeCreateFileEvent(server.el, c->fd, ae_flags, - sendReplyToClient, c) == AE_ERR) - { - freeClientAsync(c); + if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_barrier) == C_ERR) { + freeClientAsync(c); } } } @@ -1305,15 +1363,15 @@ void resetClient(client *c) { * path, it is not really released, but only marked for later release. 
*/ void protectClient(client *c) { c->flags |= CLIENT_PROTECTED; - aeDeleteFileEvent(server.el,c->fd,AE_READABLE); - aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); + connSetReadHandler(c->conn,NULL); + connSetWriteHandler(c->conn,NULL); } /* This will undo the client protection done by protectClient() */ void unprotectClient(client *c) { if (c->flags & CLIENT_PROTECTED) { c->flags &= ~CLIENT_PROTECTED; - aeCreateFileEvent(server.el,c->fd,AE_READABLE,readQueryFromClient,c); + connSetReadHandler(c->conn,readQueryFromClient); if (clientHasPendingReplies(c)) clientInstallWriteHandler(c); } } @@ -1710,12 +1768,10 @@ void processInputBufferAndReplicate(client *c) { } } -void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) { - client *c = (client*) privdata; +void readQueryFromClient(connection *conn) { + client *c = connGetPrivateData(conn); int nread, readlen; size_t qblen; - UNUSED(el); - UNUSED(mask); /* Check if we want to read from the client later when exiting from * the event loop. This is the case if threaded I/O is enabled. */ @@ -1741,12 +1797,12 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) { qblen = sdslen(c->querybuf); if (c->querybuf_peak < qblen) c->querybuf_peak = qblen; c->querybuf = sdsMakeRoomFor(c->querybuf, readlen); - nread = read(fd, c->querybuf+qblen, readlen); + nread = connRead(c->conn, c->querybuf+qblen, readlen); if (nread == -1) { - if (errno == EAGAIN) { + if (connGetState(conn) == CONN_STATE_CONNECTED) { return; } else { - serverLog(LL_VERBOSE, "Reading from client: %s",strerror(errno)); + serverLog(LL_VERBOSE, "Reading from client: %s",connGetLastError(c->conn)); freeClientAsync(c); return; } @@ -1818,7 +1874,7 @@ void genClientPeerId(client *client, char *peerid, snprintf(peerid,peerid_len,"%s:0",server.unixsocket); } else { /* TCP client. 
*/ - anetFormatPeer(client->fd,peerid,peerid_len); + connFormatPeer(client->conn,peerid,peerid_len); } } @@ -1839,8 +1895,7 @@ char *getClientPeerId(client *c) { /* Concatenate a string representing the state of a client in an human * readable format, into the sds string 's'. */ sds catClientInfoString(sds s, client *client) { - char flags[16], events[3], *p; - int emask; + char flags[16], events[3], conninfo[CONN_INFO_LEN], *p; p = flags; if (client->flags & CLIENT_SLAVE) { @@ -1864,16 +1919,17 @@ sds catClientInfoString(sds s, client *client) { if (p == flags) *p++ = 'N'; *p++ = '\0'; - emask = client->fd == -1 ? 0 : aeGetFileEvents(server.el,client->fd); p = events; - if (emask & AE_READABLE) *p++ = 'r'; - if (emask & AE_WRITABLE) *p++ = 'w'; + if (client->conn) { + if (connHasReadHandler(client->conn)) *p++ = 'r'; + if (connHasWriteHandler(client->conn)) *p++ = 'w'; + } *p = '\0'; return sdscatfmt(s, - "id=%U addr=%s fd=%i name=%s age=%I idle=%I flags=%s db=%i sub=%i psub=%i multi=%i qbuf=%U qbuf-free=%U obl=%U oll=%U omem=%U events=%s cmd=%s user=%s", + "id=%U addr=%s %s name=%s age=%I idle=%I flags=%s db=%i sub=%i psub=%i multi=%i qbuf=%U qbuf-free=%U obl=%U oll=%U omem=%U events=%s cmd=%s user=%s", (unsigned long long) client->id, getClientPeerId(client), - client->fd, + connGetInfo(client->conn, conninfo, sizeof(conninfo)), client->name ? (char*)client->name->ptr : "", (long long)(server.unixtime - client->ctime), (long long)(server.unixtime - client->lastinteraction), @@ -2445,7 +2501,7 @@ int checkClientOutputBufferLimits(client *c) { * called from contexts where the client can't be freed safely, i.e. from the * lower level functions pushing data inside the client output buffers. */ void asyncCloseClientOnOutputBufferLimitReached(client *c) { - if (c->fd == -1) return; /* It is unsafe to free fake clients. */ + if (!c->conn) return; /* It is unsafe to free fake clients. 
*/ serverAssert(c->reply_bytes < SIZE_MAX-(1024*64)); if (c->reply_bytes == 0 || c->flags & CLIENT_CLOSE_ASAP) return; if (checkClientOutputBufferLimits(c)) { @@ -2468,8 +2524,7 @@ void flushSlavesOutputBuffers(void) { listRewind(server.slaves,&li); while((ln = listNext(&li))) { client *slave = listNodeValue(ln); - int events = aeGetFileEvents(server.el,slave->fd); - int can_receive_writes = (events & AE_WRITABLE) || + int can_receive_writes = connHasWriteHandler(slave->conn) || (slave->flags & CLIENT_PENDING_WRITE); /* We don't want to send the pending data to the replica in a few @@ -2491,7 +2546,7 @@ void flushSlavesOutputBuffers(void) { !slave->repl_put_online_on_ack && clientHasPendingReplies(slave)) { - writeToClient(slave->fd,slave,0); + writeToClient(slave,0); } } } @@ -2618,9 +2673,9 @@ void *IOThreadMain(void *myid) { while((ln = listNext(&li))) { client *c = listNodeValue(ln); if (io_threads_op == IO_THREADS_OP_WRITE) { - writeToClient(c->fd,c,0); + writeToClient(c,0); } else if (io_threads_op == IO_THREADS_OP_READ) { - readQueryFromClient(NULL,c->fd,c,0); + readQueryFromClient(c->conn); } else { serverPanic("io_threads_op value is unknown"); } @@ -2761,8 +2816,7 @@ int handleClientsWithPendingWritesUsingThreads(void) { /* Install the write handler if there are pending writes in some * of the clients. */ if (clientHasPendingReplies(c) && - aeCreateFileEvent(server.el, c->fd, AE_WRITABLE, - sendReplyToClient, c) == AE_ERR) + connSetWriteHandler(c->conn, sendReplyToClient) == AE_ERR) { freeClientAsync(c); } diff --git a/src/rdb.c b/src/rdb.c index e430bcd58..2406ea88a 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -2195,6 +2195,8 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { * own reference. */ decrRefCount(key); } + if (server.key_load_delay) + usleep(server.key_load_delay); /* Reset the state that is key-specified and is populated by * opcodes before the key, so that we start from scratch again. 
*/ @@ -2290,8 +2292,6 @@ void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) { * This function covers the case of RDB -> Salves socket transfers for * diskless replication. */ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) { - uint64_t *ok_slaves; - if (!bysignal && exitcode == 0) { serverLog(LL_NOTICE, "Background RDB transfer terminated with success"); @@ -2305,79 +2305,6 @@ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) { server.rdb_child_type = RDB_CHILD_TYPE_NONE; server.rdb_save_time_start = -1; - /* If the child returns an OK exit code, read the set of slave client - * IDs and the associated status code. We'll terminate all the slaves - * in error state. - * - * If the process returned an error, consider the list of slaves that - * can continue to be empty, so that it's just a special case of the - * normal code path. */ - ok_slaves = zmalloc(sizeof(uint64_t)); /* Make space for the count. */ - ok_slaves[0] = 0; - if (!bysignal && exitcode == 0) { - int readlen = sizeof(uint64_t); - - if (read(server.rdb_pipe_read_result_from_child, ok_slaves, readlen) == - readlen) - { - readlen = ok_slaves[0]*sizeof(uint64_t)*2; - - /* Make space for enough elements as specified by the first - * uint64_t element in the array. */ - ok_slaves = zrealloc(ok_slaves,sizeof(uint64_t)+readlen); - if (readlen && - read(server.rdb_pipe_read_result_from_child, ok_slaves+1, - readlen) != readlen) - { - ok_slaves[0] = 0; - } - } - } - - close(server.rdb_pipe_read_result_from_child); - close(server.rdb_pipe_write_result_to_parent); - - /* We can continue the replication process with all the slaves that - * correctly received the full payload. Others are terminated. */ - listNode *ln; - listIter li; - - listRewind(server.slaves,&li); - while((ln = listNext(&li))) { - client *slave = ln->value; - - if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) { - uint64_t j; - int errorcode = 0; - - /* Search for the slave ID in the reply. 
In order for a slave to - * continue the replication process, we need to find it in the list, - * and it must have an error code set to 0 (which means success). */ - for (j = 0; j < ok_slaves[0]; j++) { - if (slave->id == ok_slaves[2*j+1]) { - errorcode = ok_slaves[2*j+2]; - break; /* Found in slaves list. */ - } - } - if (j == ok_slaves[0] || errorcode != 0) { - serverLog(LL_WARNING, - "Closing slave %s: child->slave RDB transfer failed: %s", - replicationGetSlaveName(slave), - (errorcode == 0) ? "RDB transfer child aborted" - : strerror(errorcode)); - freeClient(slave); - } else { - serverLog(LL_WARNING, - "Slave %s correctly received the streamed RDB file.", - replicationGetSlaveName(slave)); - /* Restore the socket as non-blocking. */ - anetNonBlock(NULL,slave->fd); - anetSendTimeout(NULL,slave->fd,0); - } - } - } - zfree(ok_slaves); - updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_SOCKET); } @@ -2409,9 +2336,6 @@ void killRDBChild(void) { /* Spawn an RDB child that writes the RDB to the sockets of the slaves * that are currently in SLAVE_STATE_WAIT_BGSAVE_START state. */ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { - int *fds; - uint64_t *clientids; - int numfds; listNode *ln; listIter li; pid_t childpid; @@ -2419,35 +2343,30 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { if (hasActiveChildProcess()) return C_ERR; - /* Before to fork, create a pipe that will be used in order to - * send back to the parent the IDs of the slaves that successfully - * received all the writes. */ + /* Even if the previous fork child exited, don't start a new one until we + * drained the pipe. */ + if (server.rdb_pipe_conns) return C_ERR; + + /* Before to fork, create a pipe that is used to transfer the rdb bytes to + * the parent, we can't let it write directly to the sockets, since in case + * of TLS we must let the parent handle a continuous TLS state when the + * child terminates and parent takes over. 
*/ if (pipe(pipefds) == -1) return C_ERR; - server.rdb_pipe_read_result_from_child = pipefds[0]; - server.rdb_pipe_write_result_to_parent = pipefds[1]; + server.rdb_pipe_read = pipefds[0]; + server.rdb_pipe_write = pipefds[1]; + anetNonBlock(NULL, server.rdb_pipe_read); - /* Collect the file descriptors of the slaves we want to transfer + /* Collect the connections of the replicas we want to transfer * the RDB to, which are i WAIT_BGSAVE_START state. */ - fds = zmalloc(sizeof(int)*listLength(server.slaves)); - /* We also allocate an array of corresponding client IDs. This will - * be useful for the child process in order to build the report - * (sent via unix pipe) that will be sent to the parent. */ - clientids = zmalloc(sizeof(uint64_t)*listLength(server.slaves)); - numfds = 0; - + server.rdb_pipe_conns = zmalloc(sizeof(connection *)*listLength(server.slaves)); + server.rdb_pipe_numconns = 0; + server.rdb_pipe_numconns_writing = 0; listRewind(server.slaves,&li); while((ln = listNext(&li))) { client *slave = ln->value; - if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) { - clientids[numfds] = slave->id; - fds[numfds++] = slave->fd; + server.rdb_pipe_conns[server.rdb_pipe_numconns++] = slave->conn; replicationSetupSlaveForFullResync(slave,getPsyncInitialOffset()); - /* Put the socket in blocking mode to simplify RDB transfer. - * We'll restore it when the children returns (since duped socket - * will share the O_NONBLOCK attribute with the parent). 
*/ - anetBlock(NULL,slave->fd); - anetSendTimeout(NULL,slave->fd,server.repl_timeout*1000); } } @@ -2456,61 +2375,22 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { if ((childpid = redisFork()) == 0) { /* Child */ int retval; - rio slave_sockets; + rio rdb; - rioInitWithFdset(&slave_sockets,fds,numfds); - zfree(fds); + rioInitWithFd(&rdb,server.rdb_pipe_write); redisSetProcTitle("redis-rdb-to-slaves"); - retval = rdbSaveRioWithEOFMark(&slave_sockets,NULL,rsi); - if (retval == C_OK && rioFlush(&slave_sockets) == 0) + retval = rdbSaveRioWithEOFMark(&rdb,NULL,rsi); + if (retval == C_OK && rioFlush(&rdb) == 0) retval = C_ERR; if (retval == C_OK) { sendChildCOWInfo(CHILD_INFO_TYPE_RDB, "RDB"); - - /* If we are returning OK, at least one slave was served - * with the RDB file as expected, so we need to send a report - * to the parent via the pipe. The format of the message is: - * - * ... - * - * len, slave IDs, and slave errors, are all uint64_t integers, - * so basically the reply is composed of 64 bits for the len field - * plus 2 additional 64 bit integers for each entry, for a total - * of 'len' entries. - * - * The 'id' represents the slave's client ID, so that the master - * can match the report with a specific slave, and 'error' is - * set to 0 if the replication process terminated with a success - * or the error code if an error occurred. */ - void *msg = zmalloc(sizeof(uint64_t)*(1+2*numfds)); - uint64_t *len = msg; - uint64_t *ids = len+1; - int j, msglen; - - *len = numfds; - for (j = 0; j < numfds; j++) { - *ids++ = clientids[j]; - *ids++ = slave_sockets.io.fdset.state[j]; - } - - /* Write the message to the parent. If we have no good slaves or - * we are unable to transfer the message to the parent, we exit - * with an error so that the parent will abort the replication - * process with all the childre that were waiting. 
*/ - msglen = sizeof(uint64_t)*(1+2*numfds); - if (*len == 0 || - write(server.rdb_pipe_write_result_to_parent,msg,msglen) - != msglen) - { - retval = C_ERR; - } - zfree(msg); } - zfree(clientids); - rioFreeFdset(&slave_sockets); + + rioFreeFd(&rdb); + close(server.rdb_pipe_write); /* wake up the reader, tell it we're done. */ exitFromChild((retval == C_OK) ? 0 : 1); } else { /* Parent */ @@ -2524,17 +2404,16 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { listRewind(server.slaves,&li); while((ln = listNext(&li))) { client *slave = ln->value; - int j; - - for (j = 0; j < numfds; j++) { - if (slave->id == clientids[j]) { - slave->replstate = SLAVE_STATE_WAIT_BGSAVE_START; - break; - } + if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) { + slave->replstate = SLAVE_STATE_WAIT_BGSAVE_START; } } - close(pipefds[0]); - close(pipefds[1]); + close(server.rdb_pipe_write); + close(server.rdb_pipe_read); + zfree(server.rdb_pipe_conns); + server.rdb_pipe_conns = NULL; + server.rdb_pipe_numconns = 0; + server.rdb_pipe_numconns_writing = 0; closeChildInfoPipe(); } else { serverLog(LL_NOTICE,"Background RDB transfer started by pid %d", @@ -2542,9 +2421,11 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { server.rdb_save_time_start = time(NULL); server.rdb_child_pid = childpid; server.rdb_child_type = RDB_CHILD_TYPE_SOCKET; + close(server.rdb_pipe_write); /* close write in parent so that it can detect the close on the child. */ + if (aeCreateFileEvent(server.el, server.rdb_pipe_read, AE_READABLE, rdbPipeReadHandler,NULL) == AE_ERR) { + serverPanic("Unrecoverable error creating server.rdb_pipe_read file event."); + } } - zfree(clientids); - zfree(fds); return (childpid == -1) ? C_ERR : C_OK; } return C_OK; /* Unreached. 
*/ diff --git a/src/redis-cli.c b/src/redis-cli.c index 2830273bb..6d07f7ba6 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -47,6 +47,10 @@ #include #include +#ifdef USE_OPENSSL +#include +#include +#endif #include /* use sds.h from hiredis, so that only one set of sds functions will be present in the binary */ #include "dict.h" #include "adlist.h" @@ -188,6 +192,12 @@ static struct config { char *hostip; int hostport; char *hostsocket; + int tls; + char *sni; + char *cacert; + char *cacertdir; + char *cert; + char *key; long repeat; long interval; int dbnum; @@ -758,6 +768,71 @@ static int cliSelect(void) { return REDIS_ERR; } +/* Wrapper around redisSecureConnection to avoid hiredis_ssl dependencies if + * not building with TLS support. + */ +static int cliSecureConnection(redisContext *c, const char **err) { +#ifdef USE_OPENSSL + static SSL_CTX *ssl_ctx = NULL; + + if (!ssl_ctx) { + ssl_ctx = SSL_CTX_new(SSLv23_client_method()); + if (!ssl_ctx) { + *err = "Failed to create SSL_CTX"; + goto error; + } + + SSL_CTX_set_options(ssl_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3); + SSL_CTX_set_verify(ssl_ctx, SSL_VERIFY_PEER, NULL); + + if (config.cacert || config.cacertdir) { + if (!SSL_CTX_load_verify_locations(ssl_ctx, config.cacert, config.cacertdir)) { + *err = "Invalid CA Certificate File/Directory"; + goto error; + } + } else { + if (!SSL_CTX_set_default_verify_paths(ssl_ctx)) { + *err = "Failed to use default CA paths"; + goto error; + } + } + + if (config.cert && !SSL_CTX_use_certificate_chain_file(ssl_ctx, config.cert)) { + *err = "Invalid client certificate"; + goto error; + } + + if (config.key && !SSL_CTX_use_PrivateKey_file(ssl_ctx, config.key, SSL_FILETYPE_PEM)) { + *err = "Invalid private key"; + goto error; + } + } + + SSL *ssl = SSL_new(ssl_ctx); + if (!ssl) { + *err = "Failed to create SSL object"; + return REDIS_ERR; + } + + if (config.sni && !SSL_set_tlsext_host_name(ssl, config.sni)) { + *err = "Failed to configure SNI"; + SSL_free(ssl); + 
return REDIS_ERR; + } + + return redisInitiateSSL(c, ssl); + +error: + SSL_CTX_free(ssl_ctx); + ssl_ctx = NULL; + return REDIS_ERR; +#else + (void) c; + (void) err; + return REDIS_OK; +#endif +} + /* Select RESP3 mode if redis-cli was started with the -3 option. */ static int cliSwitchProto(void) { redisReply *reply; @@ -789,6 +864,16 @@ static int cliConnect(int flags) { context = redisConnectUnix(config.hostsocket); } + if (!context->err && config.tls) { + const char *err = NULL; + if (cliSecureConnection(context, &err) == REDIS_ERR && err) { + fprintf(stderr, "Could not negotiate a TLS connection: %s\n", err); + redisFree(context); /* free first: clearing the pointer beforehand leaked the context */ + context = NULL; + return REDIS_ERR; + } + } + if (context->err) { if (!(flags & CC_QUIET)) { fprintf(stderr,"Could not connect to Redis at "); @@ -804,6 +889,7 @@ static int cliConnect(int flags) { return REDIS_ERR; } + /* Set aggressive KEEP_ALIVE socket option in the Redis context socket * in order to prevent timeouts caused by the execution of long * commands. At the same time this improves the detection of real @@ -1305,6 +1391,13 @@ static redisReply *reconnectingRedisCommand(redisContext *c, const char *fmt, .. 
redisFree(c); c = redisConnect(config.hostip,config.hostport); + if (!c->err && config.tls) { + const char *err = NULL; + if (cliSecureConnection(c, &err) == REDIS_ERR && err) { + fprintf(stderr, "TLS Error: %s\n", err); + exit(1); + } + } usleep(1000000); } @@ -1498,6 +1591,20 @@ static int parseOptions(int argc, char **argv) { } else if (!strcmp(argv[i],"--cluster-search-multiple-owners")) { config.cluster_manager_command.flags |= CLUSTER_MANAGER_CMD_FLAG_CHECK_OWNERS; +#ifdef USE_OPENSSL + } else if (!strcmp(argv[i],"--tls")) { + config.tls = 1; + } else if (!strcmp(argv[i],"--sni")) { + config.sni = argv[++i]; + } else if (!strcmp(argv[i],"--cacertdir")) { + config.cacertdir = argv[++i]; + } else if (!strcmp(argv[i],"--cacert")) { + config.cacert = argv[++i]; + } else if (!strcmp(argv[i],"--cert")) { + config.cert = argv[++i]; + } else if (!strcmp(argv[i],"--key")) { + config.key = argv[++i]; +#endif } else if (!strcmp(argv[i],"-v") || !strcmp(argv[i], "--version")) { sds version = cliVersion(); printf("redis-cli %s\n", version); @@ -1591,6 +1698,15 @@ static void usage(void) { " -x Read last argument from STDIN.\n" " -d Multi-bulk delimiter in for raw formatting (default: \\n).\n" " -c Enable cluster mode (follow -ASK and -MOVED redirections).\n" +#ifdef USE_OPENSSL +" --tls Establish a secure TLS connection.\n" +" --cacert CA Certificate file to verify with.\n" +" --cacertdir Directory where trusted CA certificates are stored.\n" +" If neither cacert nor cacertdir are specified, the default\n" +" system-wide trusted root certs configuration will apply.\n" +" --cert Client certificate to authenticate with.\n" +" --key Private key file to authenticate with.\n" +#endif " --raw Use raw formatting for replies (default when STDOUT is\n" " not a tty).\n" " --no-raw Force formatted output even when STDOUT is not a tty.\n" @@ -1615,7 +1731,9 @@ static void usage(void) { " --pipe Transfer raw Redis protocol from stdin to server.\n" " --pipe-timeout In --pipe mode, 
abort with error if after sending all data.\n" " no reply is received within seconds.\n" -" Default timeout: %d. Use 0 to wait forever.\n" +" Default timeout: %d. Use 0 to wait forever.\n", + REDIS_CLI_DEFAULT_PIPE_TIMEOUT); + fprintf(stderr, " --bigkeys Sample Redis keys looking for keys with many elements (complexity).\n" " --memkeys Sample Redis keys looking for keys consuming a lot of memory.\n" " --memkeys-samples Sample Redis keys looking for keys consuming a lot of memory.\n" @@ -1638,8 +1756,7 @@ static void usage(void) { " line interface.\n" " --help Output this help and exit.\n" " --version Output version and exit.\n" -"\n", - REDIS_CLI_DEFAULT_PIPE_TIMEOUT); +"\n"); /* Using another fprintf call to avoid -Woverlength-strings compile warning */ fprintf(stderr, "Cluster Manager Commands:\n" @@ -2407,6 +2524,15 @@ cleanup: static int clusterManagerNodeConnect(clusterManagerNode *node) { if (node->context) redisFree(node->context); node->context = redisConnect(node->ip, node->port); + if (!node->context->err && config.tls) { + const char *err = NULL; + if (cliSecureConnection(node->context, &err) == REDIS_ERR && err) { + fprintf(stderr,"TLS Error: %s\n", err); + redisFree(node->context); + node->context = NULL; + return 0; + } + } if (node->context->err) { fprintf(stderr,"Could not connect to Redis at "); fprintf(stderr,"%s:%d: %s\n", node->ip, node->port, diff --git a/src/replication.c b/src/replication.c index 5519b9ce2..3c916c9a7 100644 --- a/src/replication.c +++ b/src/replication.c @@ -39,7 +39,7 @@ #include void replicationDiscardCachedMaster(void); -void replicationResurrectCachedMaster(int newfd); +void replicationResurrectCachedMaster(connection *conn); void replicationSendAck(void); void putSlaveOnline(client *slave); int cancelReplicationHandshake(void); @@ -57,7 +57,7 @@ char *replicationGetSlaveName(client *c) { ip[0] = '\0'; buf[0] = '\0'; if (c->slave_ip[0] != '\0' || - anetPeerToString(c->fd,ip,sizeof(ip),NULL) != -1) + 
connPeerToString(c->conn,ip,sizeof(ip),NULL) != -1) { /* Note that the 'ip' buffer is always larger than 'c->slave_ip' */ if (c->slave_ip[0] != '\0') memcpy(ip,c->slave_ip,sizeof(c->slave_ip)); @@ -432,7 +432,7 @@ int replicationSetupSlaveForFullResync(client *slave, long long offset) { if (!(slave->flags & CLIENT_PRE_PSYNC)) { buflen = snprintf(buf,sizeof(buf),"+FULLRESYNC %s %lld\r\n", server.replid,offset); - if (write(slave->fd,buf,buflen) != buflen) { + if (connWrite(slave->conn,buf,buflen) != buflen) { freeClientAsync(slave); return C_ERR; } @@ -519,7 +519,7 @@ int masterTryPartialResynchronization(client *c) { } else { buflen = snprintf(buf,sizeof(buf),"+CONTINUE\r\n"); } - if (write(c->fd,buf,buflen) != buflen) { + if (connWrite(c->conn,buf,buflen) != buflen) { freeClientAsync(c); return C_OK; } @@ -685,7 +685,7 @@ void syncCommand(client *c) { * paths will change the state if we handle the slave differently. */ c->replstate = SLAVE_STATE_WAIT_BGSAVE_START; if (server.repl_disable_tcp_nodelay) - anetDisableTcpNoDelay(NULL, c->fd); /* Non critical if it fails. */ + connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; c->flags |= CLIENT_SLAVE; listAddNodeTail(server.slaves,c); @@ -862,8 +862,7 @@ void putSlaveOnline(client *slave) { slave->replstate = SLAVE_STATE_ONLINE; slave->repl_put_online_on_ack = 0; slave->repl_ack_time = server.unixtime; /* Prevent false timeout. 
*/ - if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, - sendReplyToClient, slave) == AE_ERR) { + if (connSetWriteHandler(slave->conn, sendReplyToClient) == C_ERR) { serverLog(LL_WARNING,"Unable to register writable event for replica bulk transfer: %s", strerror(errno)); freeClient(slave); return; @@ -873,10 +872,8 @@ void putSlaveOnline(client *slave) { replicationGetSlaveName(slave)); } -void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) { - client *slave = privdata; - UNUSED(el); - UNUSED(mask); +void sendBulkToSlave(connection *conn) { + client *slave = connGetPrivateData(conn); char buf[PROTO_IOBUF_LEN]; ssize_t nwritten, buflen; @@ -884,10 +881,10 @@ void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) { * replication process. Currently the preamble is just the bulk count of * the file in the form "$\r\n". */ if (slave->replpreamble) { - nwritten = write(fd,slave->replpreamble,sdslen(slave->replpreamble)); + nwritten = connWrite(conn,slave->replpreamble,sdslen(slave->replpreamble)); if (nwritten == -1) { serverLog(LL_VERBOSE,"Write error sending RDB preamble to replica: %s", - strerror(errno)); + connGetLastError(conn)); freeClient(slave); return; } @@ -911,10 +908,10 @@ void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) { freeClient(slave); return; } - if ((nwritten = write(fd,buf,buflen)) == -1) { - if (errno != EAGAIN) { + if ((nwritten = connWrite(conn,buf,buflen)) == -1) { + if (connGetState(conn) != CONN_STATE_CONNECTED) { serverLog(LL_WARNING,"Write error sending DB to replica: %s", - strerror(errno)); + connGetLastError(conn)); freeClient(slave); } return; @@ -924,11 +921,157 @@ void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) { if (slave->repldboff == slave->repldbsize) { close(slave->repldbfd); slave->repldbfd = -1; - aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); + connSetWriteHandler(slave->conn,NULL); putSlaveOnline(slave); } } +/* Remove one write 
handler from the list of connections waiting to be writable + * during rdb pipe transfer. */ +void rdbPipeWriteHandlerConnRemoved(struct connection *conn) { + if (!connHasWriteHandler(conn)) + return; + connSetWriteHandler(conn, NULL); + server.rdb_pipe_numconns_writing--; + /* Once no connection is left with a pending write, resume reading from the rdb pipe. */ + if (server.rdb_pipe_numconns_writing == 0) { + if (aeCreateFileEvent(server.el, server.rdb_pipe_read, AE_READABLE, rdbPipeReadHandler,NULL) == AE_ERR) { + serverPanic("Unrecoverable error creating server.rdb_pipe_read file event."); + } + } +} + +/* Called in diskless master during transfer of data from the rdb pipe, when + * the replica becomes writable again. */ +void rdbPipeWriteHandler(struct connection *conn) { + serverAssert(server.rdb_pipe_bufflen>0); + client *slave = connGetPrivateData(conn); + int nwritten; + if ((nwritten = connWrite(conn, server.rdb_pipe_buff + slave->repldboff, + server.rdb_pipe_bufflen - slave->repldboff)) == -1) + { + if (connGetState(conn) == CONN_STATE_CONNECTED) + return; /* equivalent to EAGAIN */ + serverLog(LL_WARNING,"Write error sending DB to replica: %s", + connGetLastError(conn)); + freeClient(slave); + return; + } else { + slave->repldboff += nwritten; + server.stat_net_output_bytes += nwritten; + if (slave->repldboff < server.rdb_pipe_bufflen) + return; /* more data to write.. */ + } + rdbPipeWriteHandlerConnRemoved(conn); +} + +/* When the pipe serving diskless rdb transfer is drained (write end was + * closed), we can clean up all the temporary variables, and cleanup after the + * fork child. 
*/ +void RdbPipeCleanup() { + close(server.rdb_pipe_read); + zfree(server.rdb_pipe_conns); + server.rdb_pipe_conns = NULL; + server.rdb_pipe_numconns = 0; + server.rdb_pipe_numconns_writing = 0; + zfree(server.rdb_pipe_buff); + server.rdb_pipe_buff = NULL; + server.rdb_pipe_bufflen = 0; + + /* Detecting that the child exited is deferred for as long as the pipe is + * not drained, so now is the time to check. */ + checkChildrenDone(); +} + +/* Called in diskless master, when there's data to read from the child's rdb pipe */ +void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask) { + UNUSED(mask); + UNUSED(clientData); + UNUSED(eventLoop); + int i; + if (!server.rdb_pipe_buff) + server.rdb_pipe_buff = zmalloc(PROTO_IOBUF_LEN); + serverAssert(server.rdb_pipe_numconns_writing==0); + + while (1) { + server.rdb_pipe_bufflen = read(fd, server.rdb_pipe_buff, PROTO_IOBUF_LEN); + if (server.rdb_pipe_bufflen < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + return; + serverLog(LL_WARNING,"Diskless rdb transfer, read error sending DB to replicas: %s", strerror(errno)); + for (i=0; i < server.rdb_pipe_numconns; i++) { + connection *conn = server.rdb_pipe_conns[i]; + if (!conn) + continue; + client *slave = connGetPrivateData(conn); + freeClient(slave); + server.rdb_pipe_conns[i] = NULL; + } + killRDBChild(); + return; + } + + if (server.rdb_pipe_bufflen == 0) { + /* EOF - write end was closed. 
*/ + int stillUp = 0; + aeDeleteFileEvent(server.el, server.rdb_pipe_read, AE_READABLE); + for (i=0; i < server.rdb_pipe_numconns; i++) + { + connection *conn = server.rdb_pipe_conns[i]; + if (!conn) + continue; + stillUp++; + } + serverLog(LL_WARNING,"Diskless rdb transfer, done reading from pipe, %d replicas still up.", stillUp); + RdbPipeCleanup(); + return; + } + + int stillAlive = 0; + for (i=0; i < server.rdb_pipe_numconns; i++) + { + int nwritten; + connection *conn = server.rdb_pipe_conns[i]; + if (!conn) + continue; + + client *slave = connGetPrivateData(conn); + if ((nwritten = connWrite(conn, server.rdb_pipe_buff, server.rdb_pipe_bufflen)) == -1) { + if (connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_WARNING,"Diskless rdb transfer, write error sending DB to replica: %s", + connGetLastError(conn)); + freeClient(slave); + server.rdb_pipe_conns[i] = NULL; + continue; + } + /* An error and still in connected state, is equivalent to EAGAIN */ + slave->repldboff = 0; + } else { + slave->repldboff = nwritten; + server.stat_net_output_bytes += nwritten; + } + /* If we were unable to write all the data to one of the replicas, + * setup write handler (and disable pipe read handler, below) */ + if (nwritten != server.rdb_pipe_bufflen) { + server.rdb_pipe_numconns_writing++; + connSetWriteHandler(conn, rdbPipeWriteHandler); + } + stillAlive++; + } + + /* Remove the pipe read handler if at least one write handler was set or + * no replica is left. This must precede RdbPipeCleanup(), which closes the fd. */ + int stop = server.rdb_pipe_numconns_writing || stillAlive == 0; + if (stop) aeDeleteFileEvent(server.el, server.rdb_pipe_read, AE_READABLE); + if (stillAlive == 0) { + serverLog(LL_WARNING,"Diskless rdb transfer, last replica dropped, killing fork child."); + killRDBChild(); + RdbPipeCleanup(); + } + if (stop) break; + } +} + /* This function is called at the end of every background saving, * or when the replication RDB transfer strategy is modified from * disk to socket or the other way around. 
@@ -1015,8 +1158,8 @@ void updateSlavesWaitingBgsave(int bgsaveerr, int type) { slave->replpreamble = sdscatprintf(sdsempty(),"$%lld\r\n", (unsigned long long) slave->repldbsize); - aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); - if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) { + connSetWriteHandler(slave->conn,NULL); + if (connSetWriteHandler(slave->conn,sendBulkToSlave) == C_ERR) { freeClient(slave); continue; } @@ -1084,9 +1227,8 @@ void replicationSendNewlineToMaster(void) { static time_t newline_sent; if (time(NULL) != newline_sent) { newline_sent = time(NULL); - if (write(server.repl_transfer_s,"\n",1) == -1) { - /* Pinging back in this stage is best-effort. */ - } + /* Pinging back in this stage is best-effort. */ + if (server.repl_transfer_s) connWrite(server.repl_transfer_s, "\n", 1); } } @@ -1100,8 +1242,10 @@ void replicationEmptyDbCallback(void *privdata) { /* Once we have a link with the master and the synchroniziation was * performed, this function materializes the master client we store * at server.master, starting from the specified file descriptor. 
*/ -void replicationCreateMasterClient(int fd, int dbid) { - server.master = createClient(fd); +void replicationCreateMasterClient(connection *conn, int dbid) { + server.master = createClient(conn); + if (conn) + connSetReadHandler(server.master->conn, readQueryFromClient); server.master->flags |= CLIENT_MASTER; server.master->authenticated = 1; server.master->reploff = server.master_initial_offset; @@ -1196,7 +1340,7 @@ void disklessLoadRestoreBackups(redisDb *backup, int restore, int empty_db_flags /* Asynchronously read the SYNC payload we receive from a master */ #define REPL_MAX_WRITTEN_BEFORE_FSYNC (1024*1024*8) /* 8 MB */ -void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { +void readSyncBulkPayload(connection *conn) { char buf[4096]; ssize_t nread, readlen, nwritten; int use_diskless_load; @@ -1204,9 +1348,6 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { int empty_db_flags = server.repl_slave_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS; off_t left; - UNUSED(el); - UNUSED(privdata); - UNUSED(mask); /* Static vars used to hold the EOF mark, and the last bytes received * form the server: when they match, we reached the end of the transfer. */ @@ -1217,7 +1358,7 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { /* If repl_transfer_size == -1 we still have to read the bulk length * from the master reply. */ if (server.repl_transfer_size == -1) { - if (syncReadLine(fd,buf,1024,server.repl_syncio_timeout*1000) == -1) { + if (connSyncReadLine(conn,buf,1024,server.repl_syncio_timeout*1000) == -1) { serverLog(LL_WARNING, "I/O error reading bulk count from MASTER: %s", strerror(errno)); @@ -1282,7 +1423,7 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { readlen = (left < (signed)sizeof(buf)) ? 
left : (signed)sizeof(buf); } - nread = read(fd,buf,readlen); + nread = connRead(conn,buf,readlen); if (nread <= 0) { serverLog(LL_WARNING,"I/O error trying to sync with MASTER: %s", (nread == -1) ? strerror(errno) : "connection lost"); @@ -1390,17 +1531,17 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { * handler, otherwise it will get called recursively since * rdbLoad() will call the event loop to process events from time to * time for non blocking loading. */ - aeDeleteFileEvent(server.el,server.repl_transfer_s,AE_READABLE); + connSetReadHandler(conn, NULL); serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Loading DB in memory"); rdbSaveInfo rsi = RDB_SAVE_INFO_INIT; if (use_diskless_load) { rio rdb; - rioInitWithFd(&rdb,fd,server.repl_transfer_size); + rioInitWithConn(&rdb,conn,server.repl_transfer_size); /* Put the socket in blocking mode to simplify RDB transfer. * We'll restore it when the RDB is received. */ - anetBlock(NULL,fd); - anetRecvTimeout(NULL,fd,server.repl_timeout*1000); + connBlock(conn); + connRecvTimeout(conn, server.repl_timeout*1000); startLoading(server.repl_transfer_size); if (rdbLoadRio(&rdb,&rsi,0) != C_OK) { @@ -1410,7 +1551,7 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { "Failed trying to load the MASTER synchronization DB " "from socket"); cancelReplicationHandshake(); - rioFreeFd(&rdb, NULL); + rioFreeConn(&rdb, NULL); if (server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) { /* Restore the backed up databases. */ disklessLoadRestoreBackups(diskless_load_backup,1, @@ -1443,16 +1584,16 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { { serverLog(LL_WARNING,"Replication stream EOF marker is broken"); cancelReplicationHandshake(); - rioFreeFd(&rdb, NULL); + rioFreeConn(&rdb, NULL); return; } } /* Cleanup and restore the socket to the original state to continue * with the normal replication. 
*/ - rioFreeFd(&rdb, NULL); - anetNonBlock(NULL,fd); - anetRecvTimeout(NULL,fd,0); + rioFreeConn(&rdb, NULL); + connNonBlock(conn); + connRecvTimeout(conn,0); } else { /* Ensure background save doesn't overwrite synced data */ if (server.rdb_child_pid != -1) { @@ -1529,7 +1670,7 @@ error: #define SYNC_CMD_READ (1<<0) #define SYNC_CMD_WRITE (1<<1) #define SYNC_CMD_FULL (SYNC_CMD_READ|SYNC_CMD_WRITE) -char *sendSynchronousCommand(int flags, int fd, ...) { +char *sendSynchronousCommand(int flags, connection *conn, ...) { /* Create the command to send to the master, we use redis binary * protocol to make sure correct arguments are sent. This function @@ -1540,7 +1681,7 @@ char *sendSynchronousCommand(int flags, int fd, ...) { sds cmd = sdsempty(); sds cmdargs = sdsempty(); size_t argslen = 0; - va_start(ap,fd); + va_start(ap,conn); while(1) { arg = va_arg(ap, char*); @@ -1557,12 +1698,12 @@ char *sendSynchronousCommand(int flags, int fd, ...) { sdsfree(cmdargs); /* Transfer command to the server. */ - if (syncWrite(fd,cmd,sdslen(cmd),server.repl_syncio_timeout*1000) + if (connSyncWrite(conn,cmd,sdslen(cmd),server.repl_syncio_timeout*1000) == -1) { sdsfree(cmd); return sdscatprintf(sdsempty(),"-Writing to master: %s", - strerror(errno)); + connGetLastError(conn)); } sdsfree(cmd); } @@ -1571,7 +1712,7 @@ char *sendSynchronousCommand(int flags, int fd, ...) { if (flags & SYNC_CMD_READ) { char buf[256]; - if (syncReadLine(fd,buf,sizeof(buf),server.repl_syncio_timeout*1000) + if (connSyncReadLine(conn,buf,sizeof(buf),server.repl_syncio_timeout*1000) == -1) { return sdscatprintf(sdsempty(),"-Reading from master: %s", @@ -1637,7 +1778,7 @@ char *sendSynchronousCommand(int flags, int fd, ...) 
{ #define PSYNC_FULLRESYNC 3 #define PSYNC_NOT_SUPPORTED 4 #define PSYNC_TRY_LATER 5 -int slaveTryPartialResynchronization(int fd, int read_reply) { +int slaveTryPartialResynchronization(connection *conn, int read_reply) { char *psync_replid; char psync_offset[32]; sds reply; @@ -1662,18 +1803,18 @@ int slaveTryPartialResynchronization(int fd, int read_reply) { } /* Issue the PSYNC command */ - reply = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PSYNC",psync_replid,psync_offset,NULL); + reply = sendSynchronousCommand(SYNC_CMD_WRITE,conn,"PSYNC",psync_replid,psync_offset,NULL); if (reply != NULL) { serverLog(LL_WARNING,"Unable to send PSYNC to master: %s",reply); sdsfree(reply); - aeDeleteFileEvent(server.el,fd,AE_READABLE); + connSetReadHandler(conn, NULL); return PSYNC_WRITE_ERROR; } return PSYNC_WAIT_REPLY; } /* Reading half */ - reply = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL); + reply = sendSynchronousCommand(SYNC_CMD_READ,conn,NULL); if (sdslen(reply) == 0) { /* The master may send empty newlines after it receives PSYNC * and before to reply, just to keep the connection alive. */ @@ -1681,7 +1822,7 @@ int slaveTryPartialResynchronization(int fd, int read_reply) { return PSYNC_WAIT_REPLY; } - aeDeleteFileEvent(server.el,fd,AE_READABLE); + connSetReadHandler(conn, NULL); if (!strncmp(reply,"+FULLRESYNC",11)) { char *replid = NULL, *offset = NULL; @@ -1755,7 +1896,7 @@ int slaveTryPartialResynchronization(int fd, int read_reply) { /* Setup the replication to continue. */ sdsfree(reply); - replicationResurrectCachedMaster(fd); + replicationResurrectCachedMaster(conn); /* If this instance was restarted and we read the metadata to * PSYNC from the persistence file, our replication backlog could @@ -1797,29 +1938,23 @@ int slaveTryPartialResynchronization(int fd, int read_reply) { /* This handler fires when the non blocking connect was able to * establish a connection with the master. 
*/ -void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { +void syncWithMaster(connection *conn) { char tmpfile[256], *err = NULL; int dfd = -1, maxtries = 5; - int sockerr = 0, psync_result; - socklen_t errlen = sizeof(sockerr); - UNUSED(el); - UNUSED(privdata); - UNUSED(mask); + int psync_result; /* If this event fired after the user turned the instance into a master * with SLAVEOF NO ONE we must just return ASAP. */ if (server.repl_state == REPL_STATE_NONE) { - close(fd); + connClose(conn); return; } /* Check for errors in the socket: after a non blocking connect() we * may find that the socket is in error state. */ - if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &sockerr, &errlen) == -1) - sockerr = errno; - if (sockerr) { + if (connGetState(conn) != CONN_STATE_CONNECTED) { serverLog(LL_WARNING,"Error condition on socket for SYNC: %s", - strerror(sockerr)); + connGetLastError(conn)); goto error; } @@ -1828,18 +1963,19 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { serverLog(LL_NOTICE,"Non blocking connect for SYNC fired the event."); /* Delete the writable event so that the readable event remains * registered and we can wait for the PONG reply. */ - aeDeleteFileEvent(server.el,fd,AE_WRITABLE); + connSetReadHandler(conn, syncWithMaster); + connSetWriteHandler(conn, NULL); server.repl_state = REPL_STATE_RECEIVE_PONG; /* Send the PING, don't check for errors at all, we have the timeout * that will take care about this. */ - err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PING",NULL); + err = sendSynchronousCommand(SYNC_CMD_WRITE,conn,"PING",NULL); if (err) goto write_error; return; } /* Receive the PONG command. */ if (server.repl_state == REPL_STATE_RECEIVE_PONG) { - err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL); + err = sendSynchronousCommand(SYNC_CMD_READ,conn,NULL); /* We accept only two replies as valid, a positive +PONG reply * (we just check for "+") or an authentication error. 
@@ -1864,13 +2000,13 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { /* AUTH with the master if required. */ if (server.repl_state == REPL_STATE_SEND_AUTH) { if (server.masteruser && server.masterauth) { - err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"AUTH", + err = sendSynchronousCommand(SYNC_CMD_WRITE,conn,"AUTH", server.masteruser,server.masterauth,NULL); if (err) goto write_error; server.repl_state = REPL_STATE_RECEIVE_AUTH; return; } else if (server.masterauth) { - err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"AUTH",server.masterauth,NULL); + err = sendSynchronousCommand(SYNC_CMD_WRITE,conn,"AUTH",server.masterauth,NULL); if (err) goto write_error; server.repl_state = REPL_STATE_RECEIVE_AUTH; return; @@ -1881,7 +2017,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { /* Receive AUTH reply. */ if (server.repl_state == REPL_STATE_RECEIVE_AUTH) { - err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL); + err = sendSynchronousCommand(SYNC_CMD_READ,conn,NULL); if (err[0] == '-') { serverLog(LL_WARNING,"Unable to AUTH to MASTER: %s",err); sdsfree(err); @@ -1894,11 +2030,14 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { /* Set the slave port, so that Master's INFO command can list the * slave listening port correctly. */ if (server.repl_state == REPL_STATE_SEND_PORT) { - sds port = sdsfromlonglong(server.slave_announce_port ? 
- server.slave_announce_port : server.port); - err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF", - "listening-port",port, NULL); - sdsfree(port); + int port; + if (server.slave_announce_port) port = server.slave_announce_port; + else if (server.tls_replication && server.tls_port) port = server.tls_port; + else port = server.port; + sds portstr = sdsfromlonglong(port); + err = sendSynchronousCommand(SYNC_CMD_WRITE,conn,"REPLCONF", + "listening-port",portstr, NULL); + sdsfree(portstr); if (err) goto write_error; sdsfree(err); server.repl_state = REPL_STATE_RECEIVE_PORT; @@ -1907,7 +2046,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { /* Receive REPLCONF listening-port reply. */ if (server.repl_state == REPL_STATE_RECEIVE_PORT) { - err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL); + err = sendSynchronousCommand(SYNC_CMD_READ,conn,NULL); /* Ignore the error if any, not all the Redis versions support * REPLCONF listening-port. */ if (err[0] == '-') { @@ -1928,7 +2067,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { /* Set the slave ip, so that Master's INFO command can list the * slave IP address port correctly in case of port forwarding or NAT. */ if (server.repl_state == REPL_STATE_SEND_IP) { - err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF", + err = sendSynchronousCommand(SYNC_CMD_WRITE,conn,"REPLCONF", "ip-address",server.slave_announce_ip, NULL); if (err) goto write_error; sdsfree(err); @@ -1938,7 +2077,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { /* Receive REPLCONF ip-address reply. */ if (server.repl_state == REPL_STATE_RECEIVE_IP) { - err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL); + err = sendSynchronousCommand(SYNC_CMD_READ,conn,NULL); /* Ignore the error if any, not all the Redis versions support * REPLCONF listening-port. 
*/ if (err[0] == '-') { @@ -1956,7 +2095,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { * * The master will ignore capabilities it does not understand. */ if (server.repl_state == REPL_STATE_SEND_CAPA) { - err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF", + err = sendSynchronousCommand(SYNC_CMD_WRITE,conn,"REPLCONF", "capa","eof","capa","psync2",NULL); if (err) goto write_error; sdsfree(err); @@ -1966,7 +2105,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { /* Receive CAPA reply. */ if (server.repl_state == REPL_STATE_RECEIVE_CAPA) { - err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL); + err = sendSynchronousCommand(SYNC_CMD_READ,conn,NULL); /* Ignore the error if any, not all the Redis versions support * REPLCONF capa. */ if (err[0] == '-') { @@ -1983,7 +2122,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { * and the global offset, to try a partial resync at the next * reconnection attempt. */ if (server.repl_state == REPL_STATE_SEND_PSYNC) { - if (slaveTryPartialResynchronization(fd,0) == PSYNC_WRITE_ERROR) { + if (slaveTryPartialResynchronization(conn,0) == PSYNC_WRITE_ERROR) { err = sdsnew("Write error sending the PSYNC command."); goto write_error; } @@ -1999,7 +2138,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { goto error; } - psync_result = slaveTryPartialResynchronization(fd,1); + psync_result = slaveTryPartialResynchronization(conn,1); if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */ /* If the master is in an transient error, we should try to PSYNC @@ -2028,7 +2167,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { * already populated. 
*/ if (psync_result == PSYNC_NOT_SUPPORTED) { serverLog(LL_NOTICE,"Retrying with SYNC..."); - if (syncWrite(fd,"SYNC\r\n",6,server.repl_syncio_timeout*1000) == -1) { + if (connSyncWrite(conn,"SYNC\r\n",6,server.repl_syncio_timeout*1000) == -1) { serverLog(LL_WARNING,"I/O error writing to MASTER: %s", strerror(errno)); goto error; @@ -2053,12 +2192,13 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { } /* Setup the non blocking download of the bulk file. */ - if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL) - == AE_ERR) + if (connSetReadHandler(conn, readSyncBulkPayload) + == C_ERR) { + char conninfo[CONN_INFO_LEN]; serverLog(LL_WARNING, - "Can't create readable event for SYNC: %s (fd=%d)", - strerror(errno),fd); + "Can't create readable event for SYNC: %s (%s)", + strerror(errno), connGetInfo(conn, conninfo, sizeof(conninfo))); goto error; } @@ -2070,16 +2210,15 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) { return; error: - aeDeleteFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE); if (dfd != -1) close(dfd); - close(fd); + connClose(conn); + server.repl_transfer_s = NULL; if (server.repl_transfer_fd != -1) close(server.repl_transfer_fd); if (server.repl_transfer_tmpfile) zfree(server.repl_transfer_tmpfile); server.repl_transfer_tmpfile = NULL; server.repl_transfer_fd = -1; - server.repl_transfer_s = -1; server.repl_state = REPL_STATE_CONNECT; return; @@ -2090,26 +2229,18 @@ write_error: /* Handle sendSynchronousCommand(SYNC_CMD_WRITE) errors. */ } int connectWithMaster(void) { - int fd; - - fd = anetTcpNonBlockBestEffortBindConnect(NULL, - server.masterhost,server.masterport,NET_FIRST_BIND_ADDR); - if (fd == -1) { + server.repl_transfer_s = server.tls_replication ? 
connCreateTLS() : connCreateSocket(); + if (connConnect(server.repl_transfer_s, server.masterhost, server.masterport, + NET_FIRST_BIND_ADDR, syncWithMaster) == C_ERR) { serverLog(LL_WARNING,"Unable to connect to MASTER: %s", - strerror(errno)); + connGetLastError(server.repl_transfer_s)); + connClose(server.repl_transfer_s); + server.repl_transfer_s = NULL; return C_ERR; } - if (aeCreateFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE,syncWithMaster,NULL) == - AE_ERR) - { - close(fd); - serverLog(LL_WARNING,"Can't create readable event for SYNC"); - return C_ERR; - } server.repl_transfer_lastio = server.unixtime; - server.repl_transfer_s = fd; server.repl_state = REPL_STATE_CONNECTING; return C_OK; } @@ -2119,11 +2250,8 @@ int connectWithMaster(void) { * Never call this function directly, use cancelReplicationHandshake() instead. */ void undoConnectWithMaster(void) { - int fd = server.repl_transfer_s; - - aeDeleteFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE); - close(fd); - server.repl_transfer_s = -1; + connClose(server.repl_transfer_s); + server.repl_transfer_s = NULL; } /* Abort the async download of the bulk dataset while SYNC-ing with master. @@ -2311,7 +2439,7 @@ void roleCommand(client *c) { char ip[NET_IP_STR_LEN], *slaveip = slave->slave_ip; if (slaveip[0] == '\0') { - if (anetPeerToString(slave->fd,ip,sizeof(ip),NULL) == -1) + if (connPeerToString(slave->conn,ip,sizeof(ip),NULL) == -1) continue; slaveip = ip; } @@ -2433,7 +2561,7 @@ void replicationCacheMasterUsingMyself(void) { /* The master client we create can be set to any DBID, because * the new master will start its replication stream with SELECT. */ server.master_initial_offset = server.master_repl_offset; - replicationCreateMasterClient(-1,-1); + replicationCreateMasterClient(NULL,-1); /* Use our own ID / offset. 
*/ memcpy(server.master->replid, server.replid, sizeof(server.replid)); @@ -2462,10 +2590,11 @@ void replicationDiscardCachedMaster(void) { * This function is called when successfully setup a partial resynchronization * so the stream of data that we'll receive will start from were this * master left. */ -void replicationResurrectCachedMaster(int newfd) { +void replicationResurrectCachedMaster(connection *conn) { server.master = server.cached_master; server.cached_master = NULL; - server.master->fd = newfd; + server.master->conn = conn; + connSetPrivateData(server.master->conn, server.master); server.master->flags &= ~(CLIENT_CLOSE_AFTER_REPLY|CLIENT_CLOSE_ASAP); server.master->authenticated = 1; server.master->lastinteraction = server.unixtime; @@ -2474,8 +2603,7 @@ void replicationResurrectCachedMaster(int newfd) { /* Re-add to the list of clients. */ linkClient(server.master); - if (aeCreateFileEvent(server.el, newfd, AE_READABLE, - readQueryFromClient, server.master)) { + if (connSetReadHandler(server.master->conn, readQueryFromClient)) { serverLog(LL_WARNING,"Error resurrecting the cached master, impossible to add the readable handler: %s", strerror(errno)); freeClientAsync(server.master); /* Close ASAP. */ } @@ -2483,8 +2611,7 @@ void replicationResurrectCachedMaster(int newfd) { /* We may also need to install the write handler as well if there is * pending data in the write buffers. */ if (clientHasPendingReplies(server.master)) { - if (aeCreateFileEvent(server.el, newfd, AE_WRITABLE, - sendReplyToClient, server.master)) { + if (connSetWriteHandler(server.master->conn, sendReplyToClient)) { serverLog(LL_WARNING,"Error resurrecting the cached master, impossible to add the writable handler: %s", strerror(errno)); freeClientAsync(server.master); /* Close ASAP. 
*/ } @@ -2854,9 +2981,7 @@ void replicationCron(void) { server.rdb_child_type != RDB_CHILD_TYPE_SOCKET)); if (is_presync) { - if (write(slave->fd, "\n", 1) == -1) { - /* Don't worry about socket errors, it's just a ping. */ - } + connWrite(slave->conn, "\n", 1); } } diff --git a/src/rio.c b/src/rio.c index bdbc5d0e9..c8c924380 100644 --- a/src/rio.c +++ b/src/rio.c @@ -159,13 +159,13 @@ void rioInitWithFile(rio *r, FILE *fp) { r->io.file.autosync = 0; } -/* ------------------- File descriptor implementation ------------------- +/* ------------------- Connection implementation ------------------- * We use this RIO implemetnation when reading an RDB file directly from - * the socket to the memory via rdbLoadRio(), thus this implementation - * only implements reading from a file descriptor that is, normally, + * the connection to the memory via rdbLoadRio(), thus this implementation + * only implements reading from a connection that is, normally, * just a socket. */ -static size_t rioFdWrite(rio *r, const void *buf, size_t len) { +static size_t rioConnWrite(rio *r, const void *buf, size_t len) { UNUSED(r); UNUSED(buf); UNUSED(len); @@ -173,57 +173,175 @@ static size_t rioFdWrite(rio *r, const void *buf, size_t len) { } /* Returns 1 or 0 for success/failure. */ -static size_t rioFdRead(rio *r, void *buf, size_t len) { - size_t avail = sdslen(r->io.fd.buf)-r->io.fd.pos; +static size_t rioConnRead(rio *r, void *buf, size_t len) { + size_t avail = sdslen(r->io.conn.buf)-r->io.conn.pos; /* If the buffer is too small for the entire request: realloc. */ - if (sdslen(r->io.fd.buf) + sdsavail(r->io.fd.buf) < len) - r->io.fd.buf = sdsMakeRoomFor(r->io.fd.buf, len - sdslen(r->io.fd.buf)); + if (sdslen(r->io.conn.buf) + sdsavail(r->io.conn.buf) < len) + r->io.conn.buf = sdsMakeRoomFor(r->io.conn.buf, len - sdslen(r->io.conn.buf)); /* If the remaining unused buffer is not large enough: memmove so that we * can read the rest. 
*/ - if (len > avail && sdsavail(r->io.fd.buf) < len - avail) { - sdsrange(r->io.fd.buf, r->io.fd.pos, -1); - r->io.fd.pos = 0; + if (len > avail && sdsavail(r->io.conn.buf) < len - avail) { + sdsrange(r->io.conn.buf, r->io.conn.pos, -1); + r->io.conn.pos = 0; } /* If we don't already have all the data in the sds, read more */ - while (len > sdslen(r->io.fd.buf) - r->io.fd.pos) { - size_t buffered = sdslen(r->io.fd.buf) - r->io.fd.pos; + while (len > sdslen(r->io.conn.buf) - r->io.conn.pos) { + size_t buffered = sdslen(r->io.conn.buf) - r->io.conn.pos; size_t toread = len - buffered; /* Read either what's missing, or PROTO_IOBUF_LEN, the bigger of * the two. */ if (toread < PROTO_IOBUF_LEN) toread = PROTO_IOBUF_LEN; - if (toread > sdsavail(r->io.fd.buf)) toread = sdsavail(r->io.fd.buf); - if (r->io.fd.read_limit != 0 && - r->io.fd.read_so_far + buffered + toread > r->io.fd.read_limit) + if (toread > sdsavail(r->io.conn.buf)) toread = sdsavail(r->io.conn.buf); + if (r->io.conn.read_limit != 0 && + r->io.conn.read_so_far + buffered + toread > r->io.conn.read_limit) { - if (r->io.fd.read_limit >= r->io.fd.read_so_far - buffered) - toread = r->io.fd.read_limit - r->io.fd.read_so_far - buffered; + if (r->io.conn.read_limit >= r->io.conn.read_so_far - buffered) + toread = r->io.conn.read_limit - r->io.conn.read_so_far - buffered; else { errno = EOVERFLOW; return 0; } } - int retval = read(r->io.fd.fd, - (char*)r->io.fd.buf + sdslen(r->io.fd.buf), + int retval = connRead(r->io.conn.conn, + (char*)r->io.conn.buf + sdslen(r->io.conn.buf), toread); if (retval <= 0) { if (errno == EWOULDBLOCK) errno = ETIMEDOUT; return 0; } - sdsIncrLen(r->io.fd.buf, retval); + sdsIncrLen(r->io.conn.buf, retval); } - memcpy(buf, (char*)r->io.fd.buf + r->io.fd.pos, len); - r->io.fd.read_so_far += len; - r->io.fd.pos += len; + memcpy(buf, (char*)r->io.conn.buf + r->io.conn.pos, len); + r->io.conn.read_so_far += len; + r->io.conn.pos += len; return len; } +/* Returns read/write position in file. 
*/ +static off_t rioConnTell(rio *r) { + return r->io.conn.read_so_far; +} + +/* Flushes any buffer to target device if applicable. Returns 1 on success + * and 0 on failures. */ +static int rioConnFlush(rio *r) { + /* Our flush is implemented by the write method, that recognizes a + * buffer set to NULL with a count of zero as a flush request. */ + return rioConnWrite(r,NULL,0); +} + +static const rio rioConnIO = { + rioConnRead, + rioConnWrite, + rioConnTell, + rioConnFlush, + NULL, /* update_checksum */ + 0, /* current checksum */ + 0, /* flags */ + 0, /* bytes read or written */ + 0, /* read/write chunk size */ + { { NULL, 0 } } /* union for io-specific vars */ +}; + +/* Create an RIO that implements a buffered read from an fd + * read_limit argument stops buffering when the reaching the limit. */ +void rioInitWithConn(rio *r, connection *conn, size_t read_limit) { + *r = rioConnIO; + r->io.conn.conn = conn; + r->io.conn.pos = 0; + r->io.conn.read_limit = read_limit; + r->io.conn.read_so_far = 0; + r->io.conn.buf = sdsnewlen(NULL, PROTO_IOBUF_LEN); + sdsclear(r->io.conn.buf); +} + +/* Release the RIO tream. Optionally returns the unread buffered data + * when the SDS pointer 'remaining' is passed. */ +void rioFreeConn(rio *r, sds *remaining) { + if (remaining && (size_t)r->io.conn.pos < sdslen(r->io.conn.buf)) { + if (r->io.conn.pos > 0) sdsrange(r->io.conn.buf, r->io.conn.pos, -1); + *remaining = r->io.conn.buf; + } else { + sdsfree(r->io.conn.buf); + if (remaining) *remaining = NULL; + } + r->io.conn.buf = NULL; +} + +/* ------------------- File descriptor implementation ------------------ + * This target is used to write the RDB file to pipe, when the master just + * streams the data to the replicas without creating an RDB on-disk image + * (diskless replication option). + * It only implements writes. */ + +/* Returns 1 or 0 for success/failure. 
+ * + * When buf is NULL and len is 0, the function performs a flush operation + * if there is some pending buffer, so this function is also used in order + * to implement rioFdFlush(). */ +static size_t rioFdWrite(rio *r, const void *buf, size_t len) { + ssize_t retval; + unsigned char *p = (unsigned char*) buf; + int doflush = (buf == NULL && len == 0); + + /* For small writes, we rather keep the data in user-space buffer, and flush + * it only when it grows. however for larger writes, we prefer to flush + * any pre-existing buffer, and write the new one directly without reallocs + * and memory copying. */ + if (len > PROTO_IOBUF_LEN) { + /* First, flush any pre-existing buffered data. */ + if (sdslen(r->io.fd.buf)) { + if (rioFdWrite(r, NULL, 0) == 0) + return 0; + } + /* Write the new data, keeping 'p' and 'len' from the input. */ + } else { + if (len) { + r->io.fd.buf = sdscatlen(r->io.fd.buf,buf,len); + if (sdslen(r->io.fd.buf) > PROTO_IOBUF_LEN) + doflush = 1; + if (!doflush) + return 1; + } + /* Flusing the buffered data. set 'p' and 'len' accordintly. */ + p = (unsigned char*) r->io.fd.buf; + len = sdslen(r->io.fd.buf); + } + + size_t nwritten = 0; + while(nwritten != len) { + retval = write(r->io.fd.fd,p+nwritten,len-nwritten); + if (retval <= 0) { + /* With blocking io, which is the sole user of this + * rio target, EWOULDBLOCK is returned only because of + * the SO_SNDTIMEO socket option, so we translate the error + * into one more recognizable by the user. */ + if (retval == -1 && errno == EWOULDBLOCK) errno = ETIMEDOUT; + return 0; /* error. */ + } + nwritten += retval; + } + + r->io.fd.pos += len; + sdsclear(r->io.fd.buf); + return 1; +} + +/* Returns 1 or 0 for success/failure. */ +static size_t rioFdRead(rio *r, void *buf, size_t len) { + UNUSED(r); + UNUSED(buf); + UNUSED(len); + return 0; /* Error, this target does not support reading. */ +} + /* Returns read/write position in file. 
*/ static off_t rioFdTell(rio *r) { - return r->io.fd.read_so_far; + return r->io.fd.pos; } /* Flushes any buffer to target device if applicable. Returns 1 on success @@ -247,160 +365,16 @@ static const rio rioFdIO = { { { NULL, 0 } } /* union for io-specific vars */ }; -/* Create an RIO that implements a buffered read from an fd - * read_limit argument stops buffering when the reaching the limit. */ -void rioInitWithFd(rio *r, int fd, size_t read_limit) { +void rioInitWithFd(rio *r, int fd) { *r = rioFdIO; r->io.fd.fd = fd; r->io.fd.pos = 0; - r->io.fd.read_limit = read_limit; - r->io.fd.read_so_far = 0; - r->io.fd.buf = sdsnewlen(NULL, PROTO_IOBUF_LEN); - sdsclear(r->io.fd.buf); -} - -/* Release the RIO tream. Optionally returns the unread buffered data - * when the SDS pointer 'remaining' is passed. */ -void rioFreeFd(rio *r, sds *remaining) { - if (remaining && (size_t)r->io.fd.pos < sdslen(r->io.fd.buf)) { - if (r->io.fd.pos > 0) sdsrange(r->io.fd.buf, r->io.fd.pos, -1); - *remaining = r->io.fd.buf; - } else { - sdsfree(r->io.fd.buf); - if (remaining) *remaining = NULL; - } - r->io.fd.buf = NULL; -} - -/* ------------------- File descriptors set implementation ------------------ - * This target is used to write the RDB file to N different replicas via - * sockets, when the master just streams the data to the replicas without - * creating an RDB on-disk image (diskless replication option). - * It only implements writes. */ - -/* Returns 1 or 0 for success/failure. - * The function returns success as long as we are able to correctly write - * to at least one file descriptor. - * - * When buf is NULL and len is 0, the function performs a flush operation - * if there is some pending buffer, so this function is also used in order - * to implement rioFdsetFlush(). 
*/ -static size_t rioFdsetWrite(rio *r, const void *buf, size_t len) { - ssize_t retval; - int j; - unsigned char *p = (unsigned char*) buf; - int doflush = (buf == NULL && len == 0); - - /* To start we always append to our buffer. If it gets larger than - * a given size, we actually write to the sockets. */ - if (len) { - r->io.fdset.buf = sdscatlen(r->io.fdset.buf,buf,len); - len = 0; /* Prevent entering the while below if we don't flush. */ - if (sdslen(r->io.fdset.buf) > PROTO_IOBUF_LEN) doflush = 1; - } - - if (doflush) { - p = (unsigned char*) r->io.fdset.buf; - len = sdslen(r->io.fdset.buf); - } - - /* Write in little chunchs so that when there are big writes we - * parallelize while the kernel is sending data in background to - * the TCP socket. */ - while(len) { - size_t count = len < 1024 ? len : 1024; - int broken = 0; - for (j = 0; j < r->io.fdset.numfds; j++) { - if (r->io.fdset.state[j] != 0) { - /* Skip FDs alraedy in error. */ - broken++; - continue; - } - - /* Make sure to write 'count' bytes to the socket regardless - * of short writes. */ - size_t nwritten = 0; - while(nwritten != count) { - retval = write(r->io.fdset.fds[j],p+nwritten,count-nwritten); - if (retval <= 0) { - /* With blocking sockets, which is the sole user of this - * rio target, EWOULDBLOCK is returned only because of - * the SO_SNDTIMEO socket option, so we translate the error - * into one more recognizable by the user. */ - if (retval == -1 && errno == EWOULDBLOCK) errno = ETIMEDOUT; - break; - } - nwritten += retval; - } - - if (nwritten != count) { - /* Mark this FD as broken. */ - r->io.fdset.state[j] = errno; - if (r->io.fdset.state[j] == 0) r->io.fdset.state[j] = EIO; - } - } - if (broken == r->io.fdset.numfds) return 0; /* All the FDs in error. */ - p += count; - len -= count; - r->io.fdset.pos += count; - } - - if (doflush) sdsclear(r->io.fdset.buf); - return 1; -} - -/* Returns 1 or 0 for success/failure. 
*/ -static size_t rioFdsetRead(rio *r, void *buf, size_t len) { - UNUSED(r); - UNUSED(buf); - UNUSED(len); - return 0; /* Error, this target does not support reading. */ -} - -/* Returns read/write position in file. */ -static off_t rioFdsetTell(rio *r) { - return r->io.fdset.pos; -} - -/* Flushes any buffer to target device if applicable. Returns 1 on success - * and 0 on failures. */ -static int rioFdsetFlush(rio *r) { - /* Our flush is implemented by the write method, that recognizes a - * buffer set to NULL with a count of zero as a flush request. */ - return rioFdsetWrite(r,NULL,0); -} - -static const rio rioFdsetIO = { - rioFdsetRead, - rioFdsetWrite, - rioFdsetTell, - rioFdsetFlush, - NULL, /* update_checksum */ - 0, /* current checksum */ - 0, /* flags */ - 0, /* bytes read or written */ - 0, /* read/write chunk size */ - { { NULL, 0 } } /* union for io-specific vars */ -}; - -void rioInitWithFdset(rio *r, int *fds, int numfds) { - int j; - - *r = rioFdsetIO; - r->io.fdset.fds = zmalloc(sizeof(int)*numfds); - r->io.fdset.state = zmalloc(sizeof(int)*numfds); - memcpy(r->io.fdset.fds,fds,sizeof(int)*numfds); - for (j = 0; j < numfds; j++) r->io.fdset.state[j] = 0; - r->io.fdset.numfds = numfds; - r->io.fdset.pos = 0; - r->io.fdset.buf = sdsempty(); + r->io.fd.buf = sdsempty(); } /* release the rio stream. */ -void rioFreeFdset(rio *r) { - zfree(r->io.fdset.fds); - zfree(r->io.fdset.state); - sdsfree(r->io.fdset.buf); +void rioFreeFd(rio *r) { + sdsfree(r->io.fd.buf); } /* ---------------------------- Generic functions ---------------------------- */ diff --git a/src/rio.h b/src/rio.h index eb7a05748..9576335e8 100644 --- a/src/rio.h +++ b/src/rio.h @@ -35,6 +35,7 @@ #include #include #include "sds.h" +#include "connection.h" #define RIO_FLAG_READ_ERROR (1<<0) #define RIO_FLAG_WRITE_ERROR (1<<1) @@ -76,22 +77,20 @@ struct _rio { off_t buffered; /* Bytes written since last fsync. */ off_t autosync; /* fsync after 'autosync' bytes written. 
*/ } file; - /* file descriptor */ + /* Connection object (used to read from socket) */ struct { - int fd; /* File descriptor. */ + connection *conn; /* Connection */ off_t pos; /* pos in buf that was returned */ sds buf; /* buffered data */ size_t read_limit; /* don't allow to buffer/read more than that */ size_t read_so_far; /* amount of data read from the rio (not buffered) */ - } fd; - /* Multiple FDs target (used to write to N sockets). */ + } conn; + /* FD target (used to write to pipe). */ struct { - int *fds; /* File descriptors. */ - int *state; /* Error state of each fd. 0 (if ok) or errno. */ - int numfds; + int fd; /* File descriptor. */ off_t pos; sds buf; - } fdset; + } fd; } io; }; @@ -159,11 +158,11 @@ static inline void rioClearErrors(rio *r) { void rioInitWithFile(rio *r, FILE *fp); void rioInitWithBuffer(rio *r, sds s); -void rioInitWithFd(rio *r, int fd, size_t read_limit); -void rioInitWithFdset(rio *r, int *fds, int numfds); +void rioInitWithConn(rio *r, connection *conn, size_t read_limit); +void rioInitWithFd(rio *r, int fd); -void rioFreeFdset(rio *r); -void rioFreeFd(rio *r, sds* out_remainingBufferedData); +void rioFreeFd(rio *r); +void rioFreeConn(rio *r, sds* out_remainingBufferedData); size_t rioWriteBulkCount(rio *r, char prefix, long count); size_t rioWriteBulkString(rio *r, const char *buf, size_t len); diff --git a/src/scripting.c b/src/scripting.c index ec95eb256..7cf21f408 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -61,7 +61,7 @@ sds ldbCatStackValue(sds s, lua_State *lua, int idx); #define LDB_BREAKPOINTS_MAX 64 /* Max number of breakpoints. */ #define LDB_MAX_LEN_DEFAULT 256 /* Default len limit for replies / var dumps. */ struct ldbState { - int fd; /* Socket of the debugging client. */ + connection *conn; /* Connection of the debugging client. */ int active; /* Are we debugging EVAL right now? */ int forked; /* Is this a fork()ed debugging session? */ list *logs; /* List of messages to send to the client. 
*/ @@ -1243,7 +1243,7 @@ void scriptingInit(int setup) { * Note: there is no need to create it again when this function is called * by scriptingReset(). */ if (server.lua_client == NULL) { - server.lua_client = createClient(-1); + server.lua_client = createClient(NULL); server.lua_client->flags |= CLIENT_LUA; } @@ -1734,7 +1734,7 @@ NULL /* Initialize Lua debugger data structures. */ void ldbInit(void) { - ldb.fd = -1; + ldb.conn = NULL; ldb.active = 0; ldb.logs = listCreate(); listSetFreeMethod(ldb.logs,(void (*)(void*))sdsfree); @@ -1756,7 +1756,7 @@ void ldbFlushLog(list *log) { void ldbEnable(client *c) { c->flags |= CLIENT_LUA_DEBUG; ldbFlushLog(ldb.logs); - ldb.fd = c->fd; + ldb.conn = c->conn; ldb.step = 1; ldb.bpcount = 0; ldb.luabp = 0; @@ -1811,7 +1811,7 @@ void ldbSendLogs(void) { proto = sdscatlen(proto,"\r\n",2); listDelNode(ldb.logs,ln); } - if (write(ldb.fd,proto,sdslen(proto)) == -1) { + if (connWrite(ldb.conn,proto,sdslen(proto)) == -1) { /* Avoid warning. We don't check the return value of write() * since the next read() will catch the I/O error and will * close the debugging session. */ @@ -1863,8 +1863,8 @@ int ldbStartSession(client *c) { } /* Setup our debugging session. */ - anetBlock(NULL,ldb.fd); - anetSendTimeout(NULL,ldb.fd,5000); + connBlock(ldb.conn); + connSendTimeout(ldb.conn,5000); ldb.active = 1; /* First argument of EVAL is the script itself. We split it into different @@ -1891,7 +1891,7 @@ void ldbEndSession(client *c) { /* If it's a fork()ed session, we just exit. */ if (ldb.forked) { - writeToClient(c->fd, c, 0); + writeToClient(c,0); serverLog(LL_WARNING,"Lua debugging session child exiting"); exitFromChild(0); } else { @@ -1900,8 +1900,8 @@ void ldbEndSession(client *c) { } /* Otherwise let's restore client's state. 
*/ - anetNonBlock(NULL,ldb.fd); - anetSendTimeout(NULL,ldb.fd,0); + connNonBlock(ldb.conn); + connSendTimeout(ldb.conn,0); /* Close the client connectin after sending the final EVAL reply * in order to signal the end of the debugging session. */ @@ -2538,7 +2538,7 @@ int ldbRepl(lua_State *lua) { while(1) { while((argv = ldbReplParseCommand(&argc)) == NULL) { char buf[1024]; - int nread = read(ldb.fd,buf,sizeof(buf)); + int nread = connRead(ldb.conn,buf,sizeof(buf)); if (nread <= 0) { /* Make sure the script runs without user input since the * client is no longer connected. */ diff --git a/src/sentinel.c b/src/sentinel.c index 92ea75436..0490db4e9 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -30,6 +30,10 @@ #include "server.h" #include "hiredis.h" +#ifdef USE_OPENSSL +#include "openssl/ssl.h" +#include "hiredis_ssl.h" +#endif #include "async.h" #include @@ -40,6 +44,10 @@ extern char **environ; +#ifdef USE_OPENSSL +extern SSL_CTX *redis_tls_ctx; +#endif + #define REDIS_SENTINEL_PORT 26379 /* ======================== Sentinel global state =========================== */ @@ -1995,6 +2003,19 @@ void sentinelSetClientName(sentinelRedisInstance *ri, redisAsyncContext *c, char } } +static int instanceLinkNegotiateTLS(redisAsyncContext *context) { +#ifndef USE_OPENSSL + (void) context; +#else + if (!redis_tls_ctx) return C_ERR; + SSL *ssl = SSL_new(redis_tls_ctx); + if (!ssl) return C_ERR; + + if (redisInitiateSSL(&context->c, ssl) == REDIS_ERR) return C_ERR; +#endif + return C_OK; +} + /* Create the async connections for the instance link if the link * is disconnected. Note that link->disconnected is true even if just * one of the two links (commands and pub/sub) is missing. */ @@ -2010,7 +2031,11 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { /* Commands connection. 
*/ if (link->cc == NULL) { link->cc = redisAsyncConnectBind(ri->addr->ip,ri->addr->port,NET_FIRST_BIND_ADDR); - if (link->cc->err) { + if (!link->cc->err && server.tls_replication && + (instanceLinkNegotiateTLS(link->cc) == C_ERR)) { + sentinelEvent(LL_DEBUG,"-cmd-link-reconnection",ri,"%@ #Failed to initialize TLS"); + instanceLinkCloseConnection(link,link->cc); + } else if (link->cc->err) { sentinelEvent(LL_DEBUG,"-cmd-link-reconnection",ri,"%@ #%s", link->cc->errstr); instanceLinkCloseConnection(link,link->cc); @@ -2033,7 +2058,10 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { /* Pub / Sub */ if ((ri->flags & (SRI_MASTER|SRI_SLAVE)) && link->pc == NULL) { link->pc = redisAsyncConnectBind(ri->addr->ip,ri->addr->port,NET_FIRST_BIND_ADDR); - if (link->pc->err) { + if (!link->pc->err && server.tls_replication && + (instanceLinkNegotiateTLS(link->pc) == C_ERR)) { + sentinelEvent(LL_DEBUG,"-pubsub-link-reconnection",ri,"%@ #Failed to initialize TLS"); + } else if (link->pc->err) { sentinelEvent(LL_DEBUG,"-pubsub-link-reconnection",ri,"%@ #%s", link->pc->errstr); instanceLinkCloseConnection(link,link->pc); @@ -2584,8 +2612,9 @@ int sentinelSendHello(sentinelRedisInstance *ri) { return C_ERR; announce_ip = ip; } - announce_port = sentinel.announce_port ? - sentinel.announce_port : server.port; + if (sentinel.announce_port) announce_port = sentinel.announce_port; + else if (server.tls_replication && server.tls_port) announce_port = server.tls_port; + else announce_port = server.port; /* Format and send the Hello message. 
*/ snprintf(payload,sizeof(payload), diff --git a/src/server.c b/src/server.c index f67175651..d16ff0a8e 100644 --- a/src/server.c +++ b/src/server.c @@ -1752,6 +1752,62 @@ void updateCachedTime(void) { server.daylight_active = tm.tm_isdst; } +void checkChildrenDone(void) { + int statloc; + pid_t pid; + + /* If we have a diskless rdb child (note that we support only one concurrent + * child), we want to avoid collecting it's exit status and acting on it + * as long as we didn't finish to drain the pipe, since then we're at risk + * of starting a new fork and a new pipe before we're done with the previous + * one. */ + if (server.rdb_child_pid != -1 && server.rdb_pipe_conns) + return; + + if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) { + int exitcode = WEXITSTATUS(statloc); + int bysignal = 0; + + if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc); + + /* sigKillChildHandler catches the signal and calls exit(), but we + * must make sure not to flag lastbgsave_status, etc incorrectly. + * We could directly terminate the child process via SIGUSR1 + * without handling it, but in this case Valgrind will log an + * annoying error. */ + if (exitcode == SERVER_CHILD_NOERROR_RETVAL) { + bysignal = SIGUSR1; + exitcode = 1; + } + + if (pid == -1) { + serverLog(LL_WARNING,"wait3() returned an error: %s. 
" + "rdb_child_pid = %d, aof_child_pid = %d, module_child_pid = %d", + strerror(errno), + (int) server.rdb_child_pid, + (int) server.aof_child_pid, + (int) server.module_child_pid); + } else if (pid == server.rdb_child_pid) { + backgroundSaveDoneHandler(exitcode,bysignal); + if (!bysignal && exitcode == 0) receiveChildInfo(); + } else if (pid == server.aof_child_pid) { + backgroundRewriteDoneHandler(exitcode,bysignal); + if (!bysignal && exitcode == 0) receiveChildInfo(); + } else if (pid == server.module_child_pid) { + ModuleForkDoneHandler(exitcode,bysignal); + if (!bysignal && exitcode == 0) receiveChildInfo(); + } else { + if (!ldbRemoveChild(pid)) { + serverLog(LL_WARNING, + "Warning, detected child with unmatched pid: %ld", + (long)pid); + } + } + updateDictResizePolicy(); + closeChildInfoPipe(); + } +} + /* This is our timer interrupt, called server.hz times per second. * Here is where we do a number of things that need to be done asynchronously. * For instance: @@ -1903,51 +1959,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { /* Check if a background saving or AOF rewrite in progress terminated. */ if (hasActiveChildProcess() || ldbPendingChildren()) { - int statloc; - pid_t pid; - - if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) { - int exitcode = WEXITSTATUS(statloc); - int bysignal = 0; - - if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc); - - /* sigKillChildHandler catches the signal and calls exit(), but we - * must make sure not to flag lastbgsave_status, etc incorrectly. - * We could directly terminate the child process via SIGUSR1 - * without handling it, but in this case Valgrind will log an - * annoying error. */ - if (exitcode == SERVER_CHILD_NOERROR_RETVAL) { - bysignal = SIGUSR1; - exitcode = 1; - } - - if (pid == -1) { - serverLog(LL_WARNING,"wait3() returned an error: %s. 
" - "rdb_child_pid = %d, aof_child_pid = %d, module_child_pid = %d", - strerror(errno), - (int) server.rdb_child_pid, - (int) server.aof_child_pid, - (int) server.module_child_pid); - } else if (pid == server.rdb_child_pid) { - backgroundSaveDoneHandler(exitcode,bysignal); - if (!bysignal && exitcode == 0) receiveChildInfo(); - } else if (pid == server.aof_child_pid) { - backgroundRewriteDoneHandler(exitcode,bysignal); - if (!bysignal && exitcode == 0) receiveChildInfo(); - } else if (pid == server.module_child_pid) { - ModuleForkDoneHandler(exitcode,bysignal); - if (!bysignal && exitcode == 0) receiveChildInfo(); - } else { - if (!ldbRemoveChild(pid)) { - serverLog(LL_WARNING, - "Warning, detected child with unmatched pid: %ld", - (long)pid); - } - } - updateDictResizePolicy(); - closeChildInfoPipe(); - } + checkChildrenDone(); } else { /* If there is not a background saving/rewrite in progress check if * we have to save/rewrite now. */ @@ -2054,6 +2066,11 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { void beforeSleep(struct aeEventLoop *eventLoop) { UNUSED(eventLoop); + /* Handle TLS pending data. (must be done before flushAppendOnlyFile) */ + tlsProcessPendingData(); + /* If tls still has pending unread data don't sleep at all. */ + aeSetDontWait(server.el, tlsHasPendingData()); + /* Call the Redis Cluster before sleep function. Note that this function * may change the state of Redis Cluster (from ok to fail or vice versa), * so it's a good idea to call it before serving the unblocked clients @@ -2247,11 +2264,13 @@ void initServerConfig(void) { server.dynamic_hz = CONFIG_DEFAULT_DYNAMIC_HZ; server.arch_bits = (sizeof(long) == 8) ? 
64 : 32; server.port = CONFIG_DEFAULT_SERVER_PORT; + server.tls_port = CONFIG_DEFAULT_SERVER_TLS_PORT; server.tcp_backlog = CONFIG_DEFAULT_TCP_BACKLOG; server.bindaddr_count = 0; server.unixsocket = NULL; server.unixsocketperm = CONFIG_DEFAULT_UNIX_SOCKET_PERM; server.ipfd_count = 0; + server.tlsfd_count = 0; server.sofd = -1; server.protected_mode = CONFIG_DEFAULT_PROTECTED_MODE; server.gopher_enabled = CONFIG_DEFAULT_GOPHER_ENABLED; @@ -2286,6 +2305,7 @@ void initServerConfig(void) { server.aof_rewrite_min_size = AOF_REWRITE_MIN_SIZE; server.aof_rewrite_base_size = 0; server.aof_rewrite_scheduled = 0; + server.aof_flush_sleep = 0; server.aof_last_fsync = time(NULL); server.aof_rewrite_time_last = -1; server.aof_rewrite_time_start = -1; @@ -2297,6 +2317,7 @@ void initServerConfig(void) { server.aof_rewrite_incremental_fsync = CONFIG_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC; server.rdb_save_incremental_fsync = CONFIG_DEFAULT_RDB_SAVE_INCREMENTAL_FSYNC; server.rdb_key_save_delay = CONFIG_DEFAULT_RDB_KEY_SAVE_DELAY; + server.key_load_delay = CONFIG_DEFAULT_KEY_LOAD_DELAY; server.aof_load_truncated = CONFIG_DEFAULT_AOF_LOAD_TRUNCATED; server.aof_use_rdb_preamble = CONFIG_DEFAULT_AOF_USE_RDB_PREAMBLE; server.pidfile = NULL; @@ -2368,7 +2389,7 @@ void initServerConfig(void) { server.repl_state = REPL_STATE_NONE; server.repl_transfer_tmpfile = NULL; server.repl_transfer_fd = -1; - server.repl_transfer_s = -1; + server.repl_transfer_s = NULL; server.repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; server.repl_serve_stale_data = CONFIG_DEFAULT_SLAVE_SERVE_STALE_DATA; server.repl_slave_ro = CONFIG_DEFAULT_SLAVE_READ_ONLY; @@ -2765,6 +2786,11 @@ void initServer(void) { server.clients_paused = 0; server.system_memory_size = zmalloc_get_memory_size(); + if (server.tls_port && tlsConfigure(&server.tls_ctx_config) == C_ERR) { + serverLog(LL_WARNING, "Failed to configure TLS. 
Check logs for more info."); + exit(1); + } + createSharedObjects(); adjustOpenFilesLimit(); server.el = aeCreateEventLoop(server.maxclients+CONFIG_FDSET_INCR); @@ -2780,6 +2806,9 @@ void initServer(void) { if (server.port != 0 && listenToPort(server.port,server.ipfd,&server.ipfd_count) == C_ERR) exit(1); + if (server.tls_port != 0 && + listenToPort(server.tls_port,server.tlsfd,&server.tlsfd_count) == C_ERR) + exit(1); /* Open the listening Unix domain socket. */ if (server.unixsocket != NULL) { @@ -2794,7 +2823,7 @@ void initServer(void) { } /* Abort if there are no listening sockets at all. */ - if (server.ipfd_count == 0 && server.sofd < 0) { + if (server.ipfd_count == 0 && server.tlsfd_count == 0 && server.sofd < 0) { serverLog(LL_WARNING, "Configured to not listen anywhere, exiting."); exit(1); } @@ -2820,6 +2849,11 @@ void initServer(void) { server.aof_child_pid = -1; server.module_child_pid = -1; server.rdb_child_type = RDB_CHILD_TYPE_NONE; + server.rdb_pipe_conns = NULL; + server.rdb_pipe_numconns = 0; + server.rdb_pipe_numconns_writing = 0; + server.rdb_pipe_buff = NULL; + server.rdb_pipe_bufflen = 0; server.rdb_bgsave_scheduled = 0; server.child_info_pipe[0] = -1; server.child_info_pipe[1] = -1; @@ -2866,6 +2900,14 @@ void initServer(void) { "Unrecoverable error creating server.ipfd file event."); } } + for (j = 0; j < server.tlsfd_count; j++) { + if (aeCreateFileEvent(server.el, server.tlsfd[j], AE_READABLE, + acceptTLSHandler,NULL) == AE_ERR) + { + serverPanic( + "Unrecoverable error creating server.tlsfd file event."); + } + } if (server.sofd > 0 && aeCreateFileEvent(server.el,server.sofd,AE_READABLE, acceptUnixHandler,NULL) == AE_ERR) serverPanic("Unrecoverable error creating server.sofd file event."); @@ -3570,6 +3612,7 @@ void closeListeningSockets(int unlink_unix_socket) { int j; for (j = 0; j < server.ipfd_count; j++) close(server.ipfd[j]); + for (j = 0; j < server.tlsfd_count; j++) close(server.tlsfd[j]); if (server.sofd != -1) 
close(server.sofd); if (server.cluster_enabled) for (j = 0; j < server.cfd_count; j++) close(server.cfd[j]); @@ -3940,7 +3983,7 @@ sds genRedisInfoString(char *section) { #endif (int64_t) getpid(), server.runid, - server.port, + server.port ? server.port : server.tls_port, (int64_t)uptime, (int64_t)(uptime/(3600*24)), server.hz, @@ -4324,7 +4367,7 @@ sds genRedisInfoString(char *section) { long lag = 0; if (slaveip[0] == '\0') { - if (anetPeerToString(slave->fd,ip,sizeof(ip),&port) == -1) + if (connPeerToString(slave->conn,ip,sizeof(ip),&port) == -1) continue; slaveip = ip; } @@ -4578,7 +4621,7 @@ void redisAsciiArt(void) { if (!show_logo) { serverLog(LL_NOTICE, "Running mode=%s, port=%d.", - mode, server.port + mode, server.port ? server.port : server.tls_port ); } else { snprintf(buf,1024*16,ascii_logo, @@ -4586,7 +4629,7 @@ void redisAsciiArt(void) { redisGitSHA1(), strtol(redisGitDirty(),NULL,10) > 0, (sizeof(long) == 8) ? "64" : "32", - mode, server.port, + mode, server.port ? server.port : server.tls_port, (long) getpid() ); serverLogRaw(LL_NOTICE|LL_RAW,buf); @@ -4769,7 +4812,7 @@ void redisSetProcTitle(char *title) { setproctitle("%s %s:%d%s", title, server.bindaddr_count ? server.bindaddr[0] : "*", - server.port, + server.port ? server.port : server.tls_port, server_mode); #else UNUSED(title); @@ -4920,6 +4963,7 @@ int main(int argc, char **argv) { ACLInit(); /* The ACL subsystem must be initialized ASAP because the basic networking code and client creation depends on it. */ moduleInitModulesSystem(); + tlsInit(); /* Store the executable path and arguments in a safe place in order * to be able to restart the server later. 
*/ @@ -5053,7 +5097,7 @@ int main(int argc, char **argv) { exit(1); } } - if (server.ipfd_count > 0) + if (server.ipfd_count > 0 || server.tlsfd_count > 0) serverLog(LL_NOTICE,"Ready to accept connections"); if (server.sofd > 0) serverLog(LL_NOTICE,"The server is now ready to accept connections at %s", server.unixsocket); diff --git a/src/server.h b/src/server.h index a14989237..00b54bd35 100644 --- a/src/server.h +++ b/src/server.h @@ -66,6 +66,7 @@ typedef long long mstime_t; /* millisecond time type. */ #include "quicklist.h" /* Lists are encoded as linked lists of N-elements flat arrays */ #include "rax.h" /* Radix tree */ +#include "connection.h" /* Connection abstraction */ /* Following includes allow test functions to be called from Redis main() */ #include "zipmap.h" @@ -84,6 +85,7 @@ typedef long long mstime_t; /* millisecond time type. */ #define CONFIG_MAX_HZ 500 #define MAX_CLIENTS_PER_CLOCK_TICK 200 /* HZ is adapted based on that. */ #define CONFIG_DEFAULT_SERVER_PORT 6379 /* TCP port. */ +#define CONFIG_DEFAULT_SERVER_TLS_PORT 0 /* TCP port. */ #define CONFIG_DEFAULT_TCP_BACKLOG 511 /* TCP listen backlog. */ #define CONFIG_DEFAULT_CLIENT_TIMEOUT 0 /* Default client timeout: infinite */ #define CONFIG_DEFAULT_DBNUM 16 @@ -133,6 +135,7 @@ typedef long long mstime_t; /* millisecond time type. */ #define CONFIG_DEFAULT_REPL_DISKLESS_SYNC 0 #define CONFIG_DEFAULT_REPL_DISKLESS_SYNC_DELAY 5 #define CONFIG_DEFAULT_RDB_KEY_SAVE_DELAY 0 +#define CONFIG_DEFAULT_KEY_LOAD_DELAY 0 #define CONFIG_DEFAULT_SLAVE_SERVE_STALE_DATA 1 #define CONFIG_DEFAULT_SLAVE_READ_ONLY 1 #define CONFIG_DEFAULT_SLAVE_IGNORE_MAXMEMORY 1 @@ -826,7 +829,7 @@ typedef struct user { * Clients are taken in a linked list. */ typedef struct client { uint64_t id; /* Client incremental unique ID. */ - int fd; /* Client socket. */ + connection *conn; int resp; /* RESP protocol version. Can be 2 or 3. */ redisDb *db; /* Pointer to currently SELECTed DB. */ robj *name; /* As set by CLIENT SETNAME. 
*/ @@ -1034,6 +1037,22 @@ struct malloc_stats { size_t allocator_resident; }; +/*----------------------------------------------------------------------------- + * TLS Context Configuration + *----------------------------------------------------------------------------*/ + +typedef struct redisTLSContextConfig { + char *cert_file; + char *key_file; + char *dh_params_file; + char *ca_cert_file; + char *ca_cert_dir; + char *protocols; + char *ciphers; + char *ciphersuites; + int prefer_server_ciphers; +} redisTLSContextConfig; + /*----------------------------------------------------------------------------- * Global server state *----------------------------------------------------------------------------*/ @@ -1088,6 +1107,7 @@ struct redisServer { pid_t module_child_pid; /* PID of module child */ /* Networking */ int port; /* TCP listening port */ + int tls_port; /* TLS listening port */ int tcp_backlog; /* TCP listen() backlog */ char *bindaddr[CONFIG_BINDADDR_MAX]; /* Addresses we should bind to */ int bindaddr_count; /* Number of addresses in server.bindaddr[] */ @@ -1095,6 +1115,8 @@ struct redisServer { mode_t unixsocketperm; /* UNIX socket permission */ int ipfd[CONFIG_BINDADDR_MAX]; /* TCP socket file descriptors */ int ipfd_count; /* Used slots in ipfd[] */ + int tlsfd[CONFIG_BINDADDR_MAX]; /* TLS socket file descriptors */ + int tlsfd_count; /* Used slots in tlsfd[] */ int sofd; /* Unix socket file descriptor */ int cfd[CONFIG_BINDADDR_MAX];/* Cluster bus listening socket */ int cfd_count; /* Used slots in cfd[] */ @@ -1198,6 +1220,7 @@ struct redisServer { off_t aof_rewrite_base_size; /* AOF size on latest startup or rewrite. */ off_t aof_current_size; /* AOF current size. */ off_t aof_fsync_offset; /* AOF offset which is already synced to disk. */ + int aof_flush_sleep; /* Micros to sleep before flush. (used by tests) */ int aof_rewrite_scheduled; /* Rewrite once BGSAVE terminates. 
*/ pid_t aof_child_pid; /* PID if rewriting process */ list *aof_rewrite_buf_blocks; /* Hold changes during an AOF rewrite. */ @@ -1243,10 +1266,17 @@ struct redisServer { int rdb_child_type; /* Type of save by active child. */ int lastbgsave_status; /* C_OK or C_ERR */ int stop_writes_on_bgsave_err; /* Don't allow writes if can't BGSAVE */ - int rdb_pipe_write_result_to_parent; /* RDB pipes used to return the state */ - int rdb_pipe_read_result_from_child; /* of each slave in diskless SYNC. */ + int rdb_pipe_write; /* RDB pipes used to transfer the rdb */ + int rdb_pipe_read; /* data to the parent process in diskless repl. */ + connection **rdb_pipe_conns; /* Connections which are currently the */ + int rdb_pipe_numconns; /* target of diskless rdb fork child. */ + int rdb_pipe_numconns_writing; /* Number of rdb conns with pending writes. */ + char *rdb_pipe_buff; /* In diskless replication, this buffer holds data */ + int rdb_pipe_bufflen; /* that was read from the the rdb pipe. */ int rdb_key_save_delay; /* Delay in microseconds between keys while * writing the RDB. (for testings) */ + int key_load_delay; /* Delay in microseconds between keys while + * loading aof or rdb. (for testings) */ /* Pipe and data structures for child -> parent info sharing. */ int child_info_pipe[2]; /* Pipe used to write the child_info_data. */ struct { @@ -1299,7 +1329,7 @@ struct redisServer { off_t repl_transfer_size; /* Size of RDB to read from master during sync. */ off_t repl_transfer_read; /* Amount of RDB read from master during sync. */ off_t repl_transfer_last_fsync_off; /* Offset when we fsync-ed last time. 
*/ - int repl_transfer_s; /* Slave -> Master SYNC socket */ + connection *repl_transfer_s; /* Slave -> Master SYNC connection */ int repl_transfer_fd; /* Slave -> Master SYNC temp file descriptor */ char *repl_transfer_tmpfile; /* Slave-> master SYNC temp file name */ time_t repl_transfer_lastio; /* Unix time of the latest read, for timeout */ @@ -1423,6 +1453,11 @@ struct redisServer { int watchdog_period; /* Software watchdog period in ms. 0 = off */ /* System hardware info */ size_t system_memory_size; /* Total memory in system as reported by OS */ + /* TLS Configuration */ + int tls_cluster; + int tls_replication; + int tls_auth_clients; + redisTLSContextConfig tls_ctx_config; }; typedef struct pubsubPattern { @@ -1570,12 +1605,12 @@ size_t redisPopcount(void *s, long count); void redisSetProcTitle(char *title); /* networking.c -- Networking and Client related operations */ -client *createClient(int fd); +client *createClient(connection *conn); void closeTimedoutClients(void); void freeClient(client *c); void freeClientAsync(client *c); void resetClient(client *c); -void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask); +void sendReplyToClient(connection *conn); void *addReplyDeferredLen(client *c); void setDeferredArrayLen(client *c, void *node, long length); void setDeferredMapLen(client *c, void *node, long length); @@ -1587,8 +1622,9 @@ void processInputBufferAndReplicate(client *c); void processGopherRequest(client *c); void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask); +void acceptTLSHandler(aeEventLoop *el, int fd, void *privdata, int mask); void acceptUnixHandler(aeEventLoop *el, int fd, void *privdata, int mask); -void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask); +void readQueryFromClient(connection *conn); void addReplyNull(client *c); void addReplyNullArray(client *c); void addReplyBool(client *c, int b); @@ 
-1646,7 +1682,7 @@ int handleClientsWithPendingReadsUsingThreads(void); int stopThreadedIOIfNeeded(void); int clientHasPendingReplies(client *c); void unlinkClient(client *c); -int writeToClient(int fd, client *c, int handler_installed); +int writeToClient(client *c, int handler_installed); void linkClient(client *c); void protectClient(client *c); void unprotectClient(client *c); @@ -1782,6 +1818,8 @@ void clearReplicationId2(void); void chopReplicationBacklog(void); void replicationCacheMasterUsingMyself(void); void feedReplicationBacklog(void *ptr, size_t len); +void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask); +void rdbPipeWriteHandlerConnRemoved(struct connection *conn); /* Generic persistence functions */ void startLoadingFile(FILE* fp, char* filename); @@ -1954,6 +1992,7 @@ unsigned int LRU_CLOCK(void); const char *evictPolicyToString(void); struct redisMemOverhead *getMemoryOverheadData(void); void freeMemoryOverheadData(struct redisMemOverhead *mh); +void checkChildrenDone(void); #define RESTART_SERVER_NONE 0 #define RESTART_SERVER_GRACEFULLY (1<<0) /* Do proper shutdown. */ @@ -2369,6 +2408,10 @@ void mixDigest(unsigned char *digest, void *ptr, size_t len); void xorDigest(unsigned char *digest, void *ptr, size_t len); int populateCommandTableParseFlags(struct redisCommand *c, char *strflags); +/* TLS stuff */ +void tlsInit(void); +int tlsConfigure(redisTLSContextConfig *ctx_config); + #define redisDebug(fmt, ...) \ printf("DEBUG %s:%d > " fmt "\n", __FILE__, __LINE__, __VA_ARGS__) #define redisDebugMark() \ diff --git a/src/tls.c b/src/tls.c new file mode 100644 index 000000000..5fac6902b --- /dev/null +++ b/src/tls.c @@ -0,0 +1,808 @@ +/* + * Copyright (c) 2019, Redis Labs + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include "server.h" +#include "connhelpers.h" +#include "adlist.h" + +#ifdef USE_OPENSSL + +#include <openssl/ssl.h> +#include <openssl/err.h> +#include <openssl/rand.h> + +#define REDIS_TLS_PROTO_TLSv1 (1<<0) +#define REDIS_TLS_PROTO_TLSv1_1 (1<<1) +#define REDIS_TLS_PROTO_TLSv1_2 (1<<2) +#define REDIS_TLS_PROTO_TLSv1_3 (1<<3) + +/* Use safe defaults */ +#ifdef TLS1_3_VERSION +#define REDIS_TLS_PROTO_DEFAULT (REDIS_TLS_PROTO_TLSv1_2|REDIS_TLS_PROTO_TLSv1_3) +#else +#define REDIS_TLS_PROTO_DEFAULT (REDIS_TLS_PROTO_TLSv1_2) +#endif + +extern ConnectionType CT_Socket; + +SSL_CTX *redis_tls_ctx; + +static int parseProtocolsConfig(const char *str) { + int i, count = 0; + int protocols = 0; + + if (!str) return REDIS_TLS_PROTO_DEFAULT; + sds *tokens = sdssplitlen(str, strlen(str), " ", 1, &count); + + if (!tokens) { + serverLog(LL_WARNING, "Invalid tls-protocols configuration string"); + return -1; + } + for (i = 0; i < count; i++) { + if (!strcasecmp(tokens[i], "tlsv1")) protocols |= REDIS_TLS_PROTO_TLSv1; + else if (!strcasecmp(tokens[i], "tlsv1.1")) protocols |= REDIS_TLS_PROTO_TLSv1_1; + else if (!strcasecmp(tokens[i], "tlsv1.2")) protocols |= REDIS_TLS_PROTO_TLSv1_2; + else if (!strcasecmp(tokens[i], "tlsv1.3")) { +#ifdef TLS1_3_VERSION + protocols |= REDIS_TLS_PROTO_TLSv1_3; +#else + serverLog(LL_WARNING, "TLSv1.3 is specified in tls-protocols but not supported by OpenSSL."); + protocols = -1; + break; +#endif + } else { + serverLog(LL_WARNING, "Invalid tls-protocols specified. " + "Use a combination of 'TLSv1', 'TLSv1.1', 'TLSv1.2' and 'TLSv1.3'."); + protocols = -1; + break; + } + } + sdsfreesplitres(tokens, count); + + return protocols; +} + +/* list of connections with pending data already read from the socket, but not + * served to the reader yet. 
*/ +static list *pending_list = NULL; + +void tlsInit(void) { + ERR_load_crypto_strings(); + SSL_load_error_strings(); + SSL_library_init(); + + if (!RAND_poll()) { + serverLog(LL_WARNING, "OpenSSL: Failed to seed random number generator."); + } + + pending_list = listCreate(); + + /* Server configuration */ + server.tls_auth_clients = 1; /* Secure by default */ +} + +/* Attempt to configure/reconfigure TLS. This operation is atomic and will + * leave the SSL_CTX unchanged if it fails. + */ +int tlsConfigure(redisTLSContextConfig *ctx_config) { + char errbuf[256]; + SSL_CTX *ctx = NULL; + + if (!ctx_config->cert_file) { + serverLog(LL_WARNING, "No tls-cert-file configured!"); + goto error; + } + + if (!ctx_config->key_file) { + serverLog(LL_WARNING, "No tls-key-file configured!"); + goto error; + } + + if (!ctx_config->ca_cert_file && !ctx_config->ca_cert_dir) { + serverLog(LL_WARNING, "Either tls-ca-cert-file or tls-ca-cert-dir must be configured!"); + goto error; + } + + ctx = SSL_CTX_new(SSLv23_method()); + if (!ctx) { serverLog(LL_WARNING, "Failed to allocate SSL_CTX"); goto error; } /* every call below dereferences ctx */ + SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2|SSL_OP_NO_SSLv3); + SSL_CTX_set_options(ctx, SSL_OP_SINGLE_DH_USE); + +#ifdef SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS + SSL_CTX_set_options(ctx, SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS); +#endif + + int protocols = parseProtocolsConfig(ctx_config->protocols); + if (protocols == -1) goto error; + + if (!(protocols & REDIS_TLS_PROTO_TLSv1)) + SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1); + if (!(protocols & REDIS_TLS_PROTO_TLSv1_1)) + SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1); +#ifdef SSL_OP_NO_TLSv1_2 + if (!(protocols & REDIS_TLS_PROTO_TLSv1_2)) + SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_2); +#endif +#ifdef SSL_OP_NO_TLSv1_3 + if (!(protocols & REDIS_TLS_PROTO_TLSv1_3)) + SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_3); +#endif + +#ifdef SSL_OP_NO_COMPRESSION + SSL_CTX_set_options(ctx, SSL_OP_NO_COMPRESSION); +#endif + +#ifdef SSL_OP_NO_CLIENT_RENEGOTIATION + SSL_CTX_set_options(ctx, SSL_OP_NO_CLIENT_RENEGOTIATION); +#endif + + 
if (ctx_config->prefer_server_ciphers) + SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE); + + SSL_CTX_set_mode(ctx, SSL_MODE_ENABLE_PARTIAL_WRITE|SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER); + SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER|SSL_VERIFY_FAIL_IF_NO_PEER_CERT, NULL); + SSL_CTX_set_ecdh_auto(ctx, 1); + + if (SSL_CTX_use_certificate_file(ctx, ctx_config->cert_file, SSL_FILETYPE_PEM) <= 0) { + ERR_error_string_n(ERR_get_error(), errbuf, sizeof(errbuf)); + serverLog(LL_WARNING, "Failed to load certificate: %s: %s", ctx_config->cert_file, errbuf); + goto error; + } + + if (SSL_CTX_use_PrivateKey_file(ctx, ctx_config->key_file, SSL_FILETYPE_PEM) <= 0) { + ERR_error_string_n(ERR_get_error(), errbuf, sizeof(errbuf)); + serverLog(LL_WARNING, "Failed to load private key: %s: %s", ctx_config->key_file, errbuf); + goto error; + } + + if (SSL_CTX_load_verify_locations(ctx, ctx_config->ca_cert_file, ctx_config->ca_cert_dir) <= 0) { + ERR_error_string_n(ERR_get_error(), errbuf, sizeof(errbuf)); + serverLog(LL_WARNING, "Failed to configure CA certificate(s) file/directory: %s", errbuf); + goto error; + } + + if (ctx_config->dh_params_file) { + FILE *dhfile = fopen(ctx_config->dh_params_file, "r"); + DH *dh = NULL; + if (!dhfile) { + serverLog(LL_WARNING, "Failed to load %s: %s", ctx_config->dh_params_file, strerror(errno)); + goto error; + } + + dh = PEM_read_DHparams(dhfile, NULL, NULL, NULL); + fclose(dhfile); + if (!dh) { + serverLog(LL_WARNING, "%s: failed to read DH params.", ctx_config->dh_params_file); + goto error; + } + + if (SSL_CTX_set_tmp_dh(ctx, dh) <= 0) { + ERR_error_string_n(ERR_get_error(), errbuf, sizeof(errbuf)); + serverLog(LL_WARNING, "Failed to load DH params file: %s: %s", ctx_config->dh_params_file, errbuf); + DH_free(dh); + goto error; + } + + DH_free(dh); + } + + if (ctx_config->ciphers && !SSL_CTX_set_cipher_list(ctx, ctx_config->ciphers)) { + serverLog(LL_WARNING, "Failed to configure ciphers: %s", ctx_config->ciphers); + goto error; + } + 
+#ifdef TLS1_3_VERSION + if (ctx_config->ciphersuites && !SSL_CTX_set_ciphersuites(ctx, ctx_config->ciphersuites)) { + serverLog(LL_WARNING, "Failed to configure ciphersuites: %s", ctx_config->ciphersuites); + goto error; + } +#endif + + SSL_CTX_free(redis_tls_ctx); + redis_tls_ctx = ctx; + + return C_OK; + +error: + if (ctx) SSL_CTX_free(ctx); + return C_ERR; +} + +#ifdef TLS_DEBUGGING +#define TLSCONN_DEBUG(fmt, ...) \ + serverLog(LL_DEBUG, "TLSCONN: " fmt, __VA_ARGS__) +#else +#define TLSCONN_DEBUG(fmt, ...) +#endif + +ConnectionType CT_TLS; + +/* Normal socket connections have a simple events/handler correlation. + * + * With TLS connections we need to handle cases where during a logical read + * or write operation, the SSL library asks to block for the opposite + * socket operation. + * + * When this happens, we need to do two things: + * 1. Make sure we register for the event. + * 2. Make sure we know which handler needs to execute when the + * event fires. That is, if we notify the caller of a write operation + * that it blocks, and SSL asks for a read, we need to trigger the + * write handler again on the next read event. + * + */ + +typedef enum { + WANT_READ = 1, + WANT_WRITE +} WantIOType; + +#define TLS_CONN_FLAG_READ_WANT_WRITE (1<<0) +#define TLS_CONN_FLAG_WRITE_WANT_READ (1<<1) +#define TLS_CONN_FLAG_FD_SET (1<<2) + +typedef struct tls_connection { + connection c; + int flags; + SSL *ssl; + char *ssl_error; + listNode *pending_list_node; +} tls_connection; + +connection *connCreateTLS(void) { + tls_connection *conn = zcalloc(sizeof(tls_connection)); + conn->c.type = &CT_TLS; + conn->c.fd = -1; + conn->ssl = SSL_new(redis_tls_ctx); + return (connection *) conn; +} + +connection *connCreateAcceptedTLS(int fd, int require_auth) { + tls_connection *conn = (tls_connection *) connCreateTLS(); + conn->c.fd = fd; + conn->c.state = CONN_STATE_ACCEPTING; + + if (!require_auth) { + /* We still verify certificates if provided, but don't require them. 
+ */ + SSL_set_verify(conn->ssl, SSL_VERIFY_PEER, NULL); + } + + SSL_set_fd(conn->ssl, conn->c.fd); + SSL_set_accept_state(conn->ssl); + + return (connection *) conn; +} + +static void tlsEventHandler(struct aeEventLoop *el, int fd, void *clientData, int mask); + +/* Process the return code received from OpenSSL. + * Update the want parameter with expected I/O. + * Update the connection's error state if a real error has occurred. + * Returns an SSL error code, or 0 if no further handling is required. + */ +static int handleSSLReturnCode(tls_connection *conn, int ret_value, WantIOType *want) { + if (ret_value <= 0) { + int ssl_err = SSL_get_error(conn->ssl, ret_value); + switch (ssl_err) { + case SSL_ERROR_WANT_WRITE: + *want = WANT_WRITE; + return 0; + case SSL_ERROR_WANT_READ: + *want = WANT_READ; + return 0; + case SSL_ERROR_SYSCALL: + conn->c.last_errno = errno; + if (conn->ssl_error) zfree(conn->ssl_error); + conn->ssl_error = conn->c.last_errno ? zstrdup(strerror(conn->c.last_errno)) : NULL; /* use the saved value: zfree() may have modified errno */ + break; + default: + /* Error! 
*/ + conn->c.last_errno = 0; + if (conn->ssl_error) zfree(conn->ssl_error); + conn->ssl_error = zmalloc(512); + ERR_error_string_n(ERR_get_error(), conn->ssl_error, 512); + break; + } + + return ssl_err; + } + + return 0; +} + +void registerSSLEvent(tls_connection *conn, WantIOType want) { + int mask = aeGetFileEvents(server.el, conn->c.fd); + + switch (want) { + case WANT_READ: + if (mask & AE_WRITABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); + if (!(mask & AE_READABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE, + tlsEventHandler, conn); + break; + case WANT_WRITE: + if (mask & AE_READABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE); + if (!(mask & AE_WRITABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE, + tlsEventHandler, conn); + break; + default: + serverAssert(0); + break; + } +} + +void updateSSLEvent(tls_connection *conn) { + int mask = aeGetFileEvents(server.el, conn->c.fd); + int need_read = conn->c.read_handler || (conn->flags & TLS_CONN_FLAG_WRITE_WANT_READ); + int need_write = conn->c.write_handler || (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE); + + if (need_read && !(mask & AE_READABLE)) + aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn); + if (!need_read && (mask & AE_READABLE)) + aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE); + + if (need_write && !(mask & AE_WRITABLE)) + aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE, tlsEventHandler, conn); + if (!need_write && (mask & AE_WRITABLE)) + aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); +} + +static void tlsHandleEvent(tls_connection *conn, int mask) { + int ret; + + TLSCONN_DEBUG("tlsEventHandler(): fd=%d, state=%d, mask=%d, r=%d, w=%d, flags=%d", + conn->c.fd, conn->c.state, mask, conn->c.read_handler != NULL, conn->c.write_handler != NULL, + conn->flags); + + ERR_clear_error(); + + switch (conn->c.state) { + case CONN_STATE_CONNECTING: + if (connGetSocketError((connection *) conn)) { + conn->c.last_errno = 
errno; + conn->c.state = CONN_STATE_ERROR; + } else { + if (!(conn->flags & TLS_CONN_FLAG_FD_SET)) { + SSL_set_fd(conn->ssl, conn->c.fd); + conn->flags |= TLS_CONN_FLAG_FD_SET; + } + ret = SSL_connect(conn->ssl); + if (ret <= 0) { + WantIOType want = 0; + if (!handleSSLReturnCode(conn, ret, &want)) { + registerSSLEvent(conn, want); + + /* Avoid hitting UpdateSSLEvent, which knows nothing + * of what SSL_connect() wants and instead looks at our + * R/W handlers. + */ + return; + } + + /* If not handled, it's an error */ + conn->c.state = CONN_STATE_ERROR; + } else { + conn->c.state = CONN_STATE_CONNECTED; + } + } + + if (!callHandler((connection *) conn, conn->c.conn_handler)) return; + conn->c.conn_handler = NULL; + break; + case CONN_STATE_ACCEPTING: + ret = SSL_accept(conn->ssl); + if (ret <= 0) { + WantIOType want = 0; + if (!handleSSLReturnCode(conn, ret, &want)) { + /* Avoid hitting UpdateSSLEvent, which knows nothing + * of what SSL_connect() wants and instead looks at our + * R/W handlers. + */ + registerSSLEvent(conn, want); + return; + } + + /* If not handled, it's an error */ + conn->c.state = CONN_STATE_ERROR; + } else { + conn->c.state = CONN_STATE_CONNECTED; + } + + if (!callHandler((connection *) conn, conn->c.conn_handler)) return; + conn->c.conn_handler = NULL; + break; + case CONN_STATE_CONNECTED: + { + int call_read = ((mask & AE_READABLE) && conn->c.read_handler) || + ((mask & AE_WRITABLE) && (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE)); + int call_write = ((mask & AE_WRITABLE) && conn->c.write_handler) || + ((mask & AE_READABLE) && (conn->flags & TLS_CONN_FLAG_WRITE_WANT_READ)); + + /* Normally we execute the readable event first, and the writable + * event laster. This is useful as sometimes we may be able + * to serve the reply of a query immediately after processing the + * query. + * + * However if WRITE_BARRIER is set in the mask, our application is + * asking us to do the reverse: never fire the writable event + * after the readable. 
In such a case, we invert the calls. + * This is useful when, for instance, we want to do things + * in the beforeSleep() hook, like fsynching a file to disk, + * before replying to a client. */ + int invert = conn->c.flags & CONN_FLAG_WRITE_BARRIER; + + if (!invert && call_read) { + conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; + if (!callHandler((connection *) conn, conn->c.read_handler)) return; + } + + /* Fire the writable event. */ + if (call_write) { + conn->flags &= ~TLS_CONN_FLAG_WRITE_WANT_READ; + if (!callHandler((connection *) conn, conn->c.write_handler)) return; + } + + /* If we have to invert the call, fire the readable event now + * after the writable one. */ + if (invert && call_read) { + conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; + if (!callHandler((connection *) conn, conn->c.read_handler)) return; + } + + /* If SSL has pending that, already read from the socket, we're at + * risk of not calling the read handler again, make sure to add it + * to a list of pending connection that should be handled anyway. 
*/ + if ((mask & AE_READABLE)) { + if (SSL_pending(conn->ssl) > 0) { + if (!conn->pending_list_node) { + listAddNodeTail(pending_list, conn); + conn->pending_list_node = listLast(pending_list); + } + } else if (conn->pending_list_node) { + listDelNode(pending_list, conn->pending_list_node); + conn->pending_list_node = NULL; + } + } + + break; + } + default: + break; + } + + updateSSLEvent(conn); +} + +static void tlsEventHandler(struct aeEventLoop *el, int fd, void *clientData, int mask) { + UNUSED(el); + UNUSED(fd); + tls_connection *conn = clientData; + tlsHandleEvent(conn, mask); +} + +static void connTLSClose(connection *conn_) { + tls_connection *conn = (tls_connection *) conn_; + + if (conn->ssl) { + SSL_free(conn->ssl); + conn->ssl = NULL; + } + + if (conn->ssl_error) { + zfree(conn->ssl_error); + conn->ssl_error = NULL; + } + + if (conn->pending_list_node) { + listDelNode(pending_list, conn->pending_list_node); + conn->pending_list_node = NULL; + } + + CT_Socket.close(conn_); +} + +static int connTLSAccept(connection *_conn, ConnectionCallbackFunc accept_handler) { + tls_connection *conn = (tls_connection *) _conn; + int ret; + + if (conn->c.state != CONN_STATE_ACCEPTING) return C_ERR; + ERR_clear_error(); + + /* Try to accept */ + conn->c.conn_handler = accept_handler; + ret = SSL_accept(conn->ssl); + + if (ret <= 0) { + WantIOType want = 0; + if (!handleSSLReturnCode(conn, ret, &want)) { + registerSSLEvent(conn, want); /* We'll fire back */ + return C_OK; + } else { + conn->c.state = CONN_STATE_ERROR; + return C_ERR; + } + } + + conn->c.state = CONN_STATE_CONNECTED; + if (!callHandler((connection *) conn, conn->c.conn_handler)) return C_OK; + conn->c.conn_handler = NULL; + + return C_OK; +} + +static int connTLSConnect(connection *conn_, const char *addr, int port, const char *src_addr, ConnectionCallbackFunc connect_handler) { + tls_connection *conn = (tls_connection *) conn_; + + if (conn->c.state != CONN_STATE_NONE) return C_ERR; + ERR_clear_error(); + 
+ /* Initiate Socket connection first */ + if (CT_Socket.connect(conn_, addr, port, src_addr, connect_handler) == C_ERR) return C_ERR; + + /* Return now, once the socket is connected we'll initiate + * TLS connection from the event handler. + */ + return C_OK; +} + +static int connTLSWrite(connection *conn_, const void *data, size_t data_len) { + tls_connection *conn = (tls_connection *) conn_; + int ret, ssl_err; + + if (conn->c.state != CONN_STATE_CONNECTED) return -1; + ERR_clear_error(); + ret = SSL_write(conn->ssl, data, data_len); + + if (ret <= 0) { + WantIOType want = 0; + if (!(ssl_err = handleSSLReturnCode(conn, ret, &want))) { + if (want == WANT_READ) conn->flags |= TLS_CONN_FLAG_WRITE_WANT_READ; + updateSSLEvent(conn); + errno = EAGAIN; + return -1; + } else { + if (ssl_err == SSL_ERROR_ZERO_RETURN || + ((ssl_err == SSL_ERROR_SYSCALL && !errno))) { + conn->c.state = CONN_STATE_CLOSED; + return 0; + } else { + conn->c.state = CONN_STATE_ERROR; + return -1; + } + } + } + + return ret; +} + +static int connTLSRead(connection *conn_, void *buf, size_t buf_len) { + tls_connection *conn = (tls_connection *) conn_; + int ret; + int ssl_err; + + if (conn->c.state != CONN_STATE_CONNECTED) return -1; + ERR_clear_error(); + ret = SSL_read(conn->ssl, buf, buf_len); + if (ret <= 0) { + WantIOType want = 0; + if (!(ssl_err = handleSSLReturnCode(conn, ret, &want))) { + if (want == WANT_WRITE) conn->flags |= TLS_CONN_FLAG_READ_WANT_WRITE; + updateSSLEvent(conn); + + errno = EAGAIN; + return -1; + } else { + if (ssl_err == SSL_ERROR_ZERO_RETURN || + ((ssl_err == SSL_ERROR_SYSCALL) && !errno)) { + conn->c.state = CONN_STATE_CLOSED; + return 0; + } else { + conn->c.state = CONN_STATE_ERROR; + return -1; + } + } + } + + return ret; +} + +static const char *connTLSGetLastError(connection *conn_) { + tls_connection *conn = (tls_connection *) conn_; + + if (conn->ssl_error) return conn->ssl_error; + return NULL; +} + +int connTLSSetWriteHandler(connection *conn, 
ConnectionCallbackFunc func, int barrier) { + conn->write_handler = func; + if (barrier) + conn->flags |= CONN_FLAG_WRITE_BARRIER; + else + conn->flags &= ~CONN_FLAG_WRITE_BARRIER; + updateSSLEvent((tls_connection *) conn); + return C_OK; +} + +int connTLSSetReadHandler(connection *conn, ConnectionCallbackFunc func) { + conn->read_handler = func; + updateSSLEvent((tls_connection *) conn); + return C_OK; +} + +static void setBlockingTimeout(tls_connection *conn, long long timeout) { + anetBlock(NULL, conn->c.fd); + anetSendTimeout(NULL, conn->c.fd, timeout); + anetRecvTimeout(NULL, conn->c.fd, timeout); +} + +static void unsetBlockingTimeout(tls_connection *conn) { + anetNonBlock(NULL, conn->c.fd); + anetSendTimeout(NULL, conn->c.fd, 0); + anetRecvTimeout(NULL, conn->c.fd, 0); +} + +static int connTLSBlockingConnect(connection *conn_, const char *addr, int port, long long timeout) { + tls_connection *conn = (tls_connection *) conn_; + int ret; + + if (conn->c.state != CONN_STATE_NONE) return C_ERR; + + /* Initiate socket blocking connect first */ + if (CT_Socket.blocking_connect(conn_, addr, port, timeout) == C_ERR) return C_ERR; + + /* Initiate TLS connection now. We set up a send/recv timeout on the socket, + * which means the specified timeout will not be enforced accurately. 
*/ + SSL_set_fd(conn->ssl, conn->c.fd); + setBlockingTimeout(conn, timeout); + + if ((ret = SSL_connect(conn->ssl)) <= 0) { + conn->c.state = CONN_STATE_ERROR; + return C_ERR; + } + unsetBlockingTimeout(conn); + + conn->c.state = CONN_STATE_CONNECTED; + return C_OK; +} + +static ssize_t connTLSSyncWrite(connection *conn_, char *ptr, ssize_t size, long long timeout) { + tls_connection *conn = (tls_connection *) conn_; + + setBlockingTimeout(conn, timeout); + SSL_clear_mode(conn->ssl, SSL_MODE_ENABLE_PARTIAL_WRITE); + int ret = SSL_write(conn->ssl, ptr, size); + SSL_set_mode(conn->ssl, SSL_MODE_ENABLE_PARTIAL_WRITE); + unsetBlockingTimeout(conn); + + return ret; +} + +static ssize_t connTLSSyncRead(connection *conn_, char *ptr, ssize_t size, long long timeout) { + tls_connection *conn = (tls_connection *) conn_; + + setBlockingTimeout(conn, timeout); + int ret = SSL_read(conn->ssl, ptr, size); + unsetBlockingTimeout(conn); + + return ret; +} + +static ssize_t connTLSSyncReadLine(connection *conn_, char *ptr, ssize_t size, long long timeout) { + tls_connection *conn = (tls_connection *) conn_; + ssize_t nread = 0; + + setBlockingTimeout(conn, timeout); + + size--; + while(size) { + char c; + + if (SSL_read(conn->ssl,&c,1) <= 0) { + nread = -1; + goto exit; + } + if (c == '\n') { + *ptr = '\0'; + if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0'; + goto exit; + } else { + *ptr++ = c; + *ptr = '\0'; + nread++; + } + size--; + } +exit: + unsetBlockingTimeout(conn); + return nread; +} + +ConnectionType CT_TLS = { + .ae_handler = tlsEventHandler, + .accept = connTLSAccept, + .connect = connTLSConnect, + .blocking_connect = connTLSBlockingConnect, + .read = connTLSRead, + .write = connTLSWrite, + .close = connTLSClose, + .set_write_handler = connTLSSetWriteHandler, + .set_read_handler = connTLSSetReadHandler, + .get_last_error = connTLSGetLastError, + .sync_write = connTLSSyncWrite, + .sync_read = connTLSSyncRead, + .sync_readline = connTLSSyncReadLine, +}; + +int 
tlsHasPendingData() { + if (!pending_list) + return 0; + return listLength(pending_list) > 0; +} + +void tlsProcessPendingData() { + listIter li; + listNode *ln; + + listRewind(pending_list,&li); + while((ln = listNext(&li))) { + tls_connection *conn = listNodeValue(ln); + tlsHandleEvent(conn, AE_READABLE); + } +} + +#else /* USE_OPENSSL */ + +void tlsInit(void) { +} + +int tlsConfigure(redisTLSContextConfig *ctx_config) { + UNUSED(ctx_config); + return C_OK; +} + +connection *connCreateTLS(void) { + return NULL; +} + +connection *connCreateAcceptedTLS(int fd, int require_auth) { + UNUSED(fd); + UNUSED(require_auth); + + return NULL; +} + +int tlsHasPendingData() { + return 0; +} + +void tlsProcessPendingData() { +} + +#endif diff --git a/tests/cluster/run.tcl b/tests/cluster/run.tcl index 93603ddc9..d9a7d7ee5 100644 --- a/tests/cluster/run.tcl +++ b/tests/cluster/run.tcl @@ -8,6 +8,7 @@ source ../instances.tcl source ../../support/cluster.tcl ; # Redis Cluster client. set ::instances_count 20 ; # How many instances we use at max. +set ::tlsdir "../../tls" proc main {} { parse_options diff --git a/tests/cluster/tests/04-resharding.tcl b/tests/cluster/tests/04-resharding.tcl index 68fba135e..33f861dc5 100644 --- a/tests/cluster/tests/04-resharding.tcl +++ b/tests/cluster/tests/04-resharding.tcl @@ -4,6 +4,7 @@ # are preseved across iterations. 
source "../tests/includes/init-tests.tcl" +source "../../../tests/support/cli.tcl" test "Create a 5 nodes cluster" { create_cluster 5 5 @@ -79,6 +80,7 @@ test "Cluster consistency during live resharding" { --cluster-to $target \ --cluster-slots 100 \ --cluster-yes \ + {*}[rediscli_tls_config "../../../tests"] \ | [info nameofexecutable] \ ../tests/helpers/onlydots.tcl \ &] 0] diff --git a/tests/cluster/tests/12-replica-migration-2.tcl b/tests/cluster/tests/12-replica-migration-2.tcl index 3d8b7b04b..dd18a979a 100644 --- a/tests/cluster/tests/12-replica-migration-2.tcl +++ b/tests/cluster/tests/12-replica-migration-2.tcl @@ -5,6 +5,7 @@ # other masters have slaves. source "../tests/includes/init-tests.tcl" +source "../../../tests/support/cli.tcl" # Create a cluster with 5 master and 15 slaves, to make sure there are no # empty masters and make rebalancing simpler to handle during the test. @@ -33,7 +34,9 @@ test "Resharding all the master #0 slots away from it" { set output [exec \ ../../../src/redis-cli --cluster rebalance \ 127.0.0.1:[get_instance_attrib redis 0 port] \ + {*}[rediscli_tls_config "../../../tests"] \ --cluster-weight ${master0_id}=0 >@ stdout ] + } test "Master #0 should lose its replicas" { @@ -51,6 +54,7 @@ test "Resharding back some slot to master #0" { set output [exec \ ../../../src/redis-cli --cluster rebalance \ 127.0.0.1:[get_instance_attrib redis 0 port] \ + {*}[rediscli_tls_config "../../../tests"] \ --cluster-weight ${master0_id}=.01 \ --cluster-use-empty-masters >@ stdout] } diff --git a/tests/helpers/bg_block_op.tcl b/tests/helpers/bg_block_op.tcl index 238d3874f..c8b323308 100644 --- a/tests/helpers/bg_block_op.tcl +++ b/tests/helpers/bg_block_op.tcl @@ -1,6 +1,8 @@ source tests/support/redis.tcl source tests/support/util.tcl +set ::tlsdir "tests/tls" + # This function sometimes writes sometimes blocking-reads from lists/sorted # sets. 
There are multiple processes like this executing at the same time # so that we have some chance to trap some corner condition if there is @@ -8,8 +10,8 @@ source tests/support/util.tcl # space to just a few elements, and balance the operations so that it is # unlikely that lists and zsets just get more data without ever causing # blocking. -proc bg_block_op {host port db ops} { - set r [redis $host $port] +proc bg_block_op {host port db ops tls} { + set r [redis $host $port 0 $tls] $r select $db for {set j 0} {$j < $ops} {incr j} { @@ -49,4 +51,4 @@ proc bg_block_op {host port db ops} { } } -bg_block_op [lindex $argv 0] [lindex $argv 1] [lindex $argv 2] [lindex $argv 3] +bg_block_op [lindex $argv 0] [lindex $argv 1] [lindex $argv 2] [lindex $argv 3] [lindex $argv 4] diff --git a/tests/helpers/bg_complex_data.tcl b/tests/helpers/bg_complex_data.tcl index dffd7c668..e888748a7 100644 --- a/tests/helpers/bg_complex_data.tcl +++ b/tests/helpers/bg_complex_data.tcl @@ -1,10 +1,12 @@ source tests/support/redis.tcl source tests/support/util.tcl -proc bg_complex_data {host port db ops} { - set r [redis $host $port] +set ::tlsdir "tests/tls" + +proc bg_complex_data {host port db ops tls} { + set r [redis $host $port 0 $tls] $r select $db createComplexDataset $r $ops } -bg_complex_data [lindex $argv 0] [lindex $argv 1] [lindex $argv 2] [lindex $argv 3] +bg_complex_data [lindex $argv 0] [lindex $argv 1] [lindex $argv 2] [lindex $argv 3] [lindex $argv 4] diff --git a/tests/helpers/gen_write_load.tcl b/tests/helpers/gen_write_load.tcl index 6d1a34516..fd6aad40c 100644 --- a/tests/helpers/gen_write_load.tcl +++ b/tests/helpers/gen_write_load.tcl @@ -1,8 +1,10 @@ source tests/support/redis.tcl -proc gen_write_load {host port seconds} { +set ::tlsdir "tests/tls" + +proc gen_write_load {host port seconds tls} { set start_time [clock seconds] - set r [redis $host $port 1] + set r [redis $host $port 1 $tls] $r select 9 while 1 { $r set [expr rand()] [expr rand()] @@ -12,4 +14,4 @@
proc gen_write_load {host port seconds} { } } -gen_write_load [lindex $argv 0] [lindex $argv 1] [lindex $argv 2] +gen_write_load [lindex $argv 0] [lindex $argv 1] [lindex $argv 2] [lindex $argv 3] diff --git a/tests/instances.tcl b/tests/instances.tcl index 357b34818..0a0cbab12 100644 --- a/tests/instances.tcl +++ b/tests/instances.tcl @@ -17,6 +17,7 @@ source ../support/test.tcl set ::verbose 0 set ::valgrind 0 +set ::tls 0 set ::pause_on_error 0 set ::simulate_error 0 set ::failed 0 @@ -69,7 +70,19 @@ proc spawn_instance {type base_port count {conf {}}} { # Write the instance config file. set cfgfile [file join $dirname $type.conf] set cfg [open $cfgfile w] - puts $cfg "port $port" + if {$::tls} { + puts $cfg "tls-port $port" + puts $cfg "tls-replication yes" + puts $cfg "tls-cluster yes" + puts $cfg "port 0" + puts $cfg [format "tls-cert-file %s/../../tls/redis.crt" [pwd]] + puts $cfg [format "tls-key-file %s/../../tls/redis.key" [pwd]] + puts $cfg [format "tls-dh-params-file %s/../../tls/redis.dh" [pwd]] + puts $cfg [format "tls-ca-cert-file %s/../../tls/ca.crt" [pwd]] + puts $cfg "loglevel debug" + } else { + puts $cfg "port $port" + } puts $cfg "dir ./$dirname" puts $cfg "logfile log.txt" # Add additional config files @@ -88,7 +101,7 @@ proc spawn_instance {type base_port count {conf {}}} { } # Push the instance into the right list - set link [redis 127.0.0.1 $port] + set link [redis 127.0.0.1 $port 0 $::tls] $link reconnect 1 lappend ::${type}_instances [list \ pid $pid \ @@ -148,6 +161,13 @@ proc parse_options {} { set ::simulate_error 1 } elseif {$opt eq {--valgrind}} { set ::valgrind 1 + } elseif {$opt eq {--tls}} { + package require tls 1.6 + ::tls::init \ + -cafile "$::tlsdir/ca.crt" \ + -certfile "$::tlsdir/redis.crt" \ + -keyfile "$::tlsdir/redis.key" + set ::tls 1 } elseif {$opt eq "--help"} { puts "Hello, I'm sentinel.tcl and I run Sentinel unit tests." 
puts "\nOptions:" @@ -492,7 +512,7 @@ proc restart_instance {type id} { } # Connect with it with a fresh link - set link [redis 127.0.0.1 $port] + set link [redis 127.0.0.1 $port 0 $::tls] $link reconnect 1 set_instance_attrib $type $id link $link diff --git a/tests/integration/aof-race.tcl b/tests/integration/aof-race.tcl index fb8d71083..2991e7962 100644 --- a/tests/integration/aof-race.tcl +++ b/tests/integration/aof-race.tcl @@ -13,8 +13,9 @@ tags {"aof"} { # cleaned after a child responsible for an AOF rewrite exited. This buffer # was subsequently appended to the new AOF, resulting in duplicate commands. start_server_aof [list dir $server_path] { - set client [redis [srv host] [srv port]] - set bench [open "|src/redis-benchmark -q -p [srv port] -c 20 -n 20000 incr foo" "r+"] + set client [redis [srv host] [srv port] 0 $::tls] + set bench [open "|src/redis-benchmark -q -s [srv unixsocket] -c 20 -n 20000 incr foo" "r+"] + after 100 # Benchmark should be running by now: start background rewrite @@ -29,7 +30,7 @@ tags {"aof"} { # Restart server to replay AOF start_server_aof [list dir $server_path] { - set client [redis [srv host] [srv port]] + set client [redis [srv host] [srv port] 0 $::tls] assert_equal 20000 [$client get foo] } } diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl index e397faeeb..2734de7f1 100644 --- a/tests/integration/aof.tcl +++ b/tests/integration/aof.tcl @@ -52,7 +52,7 @@ tags {"aof"} { assert_equal 1 [is_alive $srv] } - set client [redis [dict get $srv host] [dict get $srv port]] + set client [redis [dict get $srv host] [dict get $srv port] 0 $::tls] test "Truncated AOF loaded: we expect foo to be equal to 5" { assert {[$client get foo] eq "5"} @@ -69,7 +69,7 @@ tags {"aof"} { assert_equal 1 [is_alive $srv] } - set client [redis [dict get $srv host] [dict get $srv port]] + set client [redis [dict get $srv host] [dict get $srv port] 0 $::tls] test "Truncated AOF loaded: we expect foo to be equal to 6 now" { assert 
{[$client get foo] eq "6"} @@ -170,7 +170,7 @@ tags {"aof"} { } test "Fixed AOF: Keyspace should contain values that were parseable" { - set client [redis [dict get $srv host] [dict get $srv port]] + set client [redis [dict get $srv host] [dict get $srv port] 0 $::tls] wait_for_condition 50 100 { [catch {$client ping} e] == 0 } else { @@ -194,7 +194,7 @@ tags {"aof"} { } test "AOF+SPOP: Set should have 1 member" { - set client [redis [dict get $srv host] [dict get $srv port]] + set client [redis [dict get $srv host] [dict get $srv port] 0 $::tls] wait_for_condition 50 100 { [catch {$client ping} e] == 0 } else { @@ -218,7 +218,7 @@ tags {"aof"} { } test "AOF+SPOP: Set should have 1 member" { - set client [redis [dict get $srv host] [dict get $srv port]] + set client [redis [dict get $srv host] [dict get $srv port] 0 $::tls] wait_for_condition 50 100 { [catch {$client ping} e] == 0 } else { @@ -241,7 +241,7 @@ tags {"aof"} { } test "AOF+EXPIRE: List should be empty" { - set client [redis [dict get $srv host] [dict get $srv port]] + set client [redis [dict get $srv host] [dict get $srv port] 0 $::tls] wait_for_condition 50 100 { [catch {$client ping} e] == 0 } else { @@ -257,4 +257,35 @@ tags {"aof"} { r expire x -1 } } + + start_server {overrides {appendonly {yes} appendfilename {appendonly.aof} appendfsync always}} { + test {AOF fsync always barrier issue} { + set rd [redis_deferring_client] + # Set a sleep when aof is flushed, so that we have a chance to look + # at the aof size and detect if the response of an incr command + # arrives before the data was written (and hopefully fsynced) + # We create a big reply, which will hopefully not have room in the + # socket buffers, and will install a write handler, then we sleep + # a big and issue the incr command, hoping that the last portion of + # the output buffer write, and the processing of the incr will happen + # in the same event loop cycle. 
+ # Since the socket buffers and timing are unpredictable, we fuzz this + # test with slightly different sizes and sleeps a few times. + for {set i 0} {$i < 10} {incr i} { + r debug aof-flush-sleep 0 + r del x + r setrange x [expr {int(rand()*5000000)+10000000}] x + r debug aof-flush-sleep 500000 + set aof [file join [lindex [r config get dir] 1] appendonly.aof] + set size1 [file size $aof] + $rd get x + after [expr {int(rand()*30)}] + $rd incr new_value + $rd read + $rd read + set size2 [file size $aof] + assert {$size1 != $size2} + } + } + } } diff --git a/tests/integration/block-repl.tcl b/tests/integration/block-repl.tcl index c111b805b..07eceb228 100644 --- a/tests/integration/block-repl.tcl +++ b/tests/integration/block-repl.tcl @@ -2,9 +2,9 @@ # Unlike stream operations such operations are "pop" style, so they consume # the list or sorted set, and must be replicated correctly. -proc start_bg_block_op {host port db ops} { +proc start_bg_block_op {host port db ops tls} { set tclsh [info nameofexecutable] - exec $tclsh tests/helpers/bg_block_op.tcl $host $port $db $ops & + exec $tclsh tests/helpers/bg_block_op.tcl $host $port $db $ops $tls & } proc stop_bg_block_op {handle} { @@ -18,9 +18,9 @@ start_server {tags {"repl"}} { set master_port [srv -1 port] set slave [srv 0 client] - set load_handle0 [start_bg_block_op $master_host $master_port 9 100000] - set load_handle1 [start_bg_block_op $master_host $master_port 9 100000] - set load_handle2 [start_bg_block_op $master_host $master_port 9 100000] + set load_handle0 [start_bg_block_op $master_host $master_port 9 100000 $::tls] + set load_handle1 [start_bg_block_op $master_host $master_port 9 100000 $::tls] + set load_handle2 [start_bg_block_op $master_host $master_port 9 100000 $::tls] test {First server should have role slave after SLAVEOF} { $slave slaveof $master_host $master_port diff --git a/tests/integration/psync2-reg.tcl b/tests/integration/psync2-reg.tcl index 3d408368e..b5ad021e2 100644 --- 
a/tests/integration/psync2-reg.tcl +++ b/tests/integration/psync2-reg.tcl @@ -18,6 +18,7 @@ start_server {} { set R($j) [srv [expr 0-$j] client] set R_host($j) [srv [expr 0-$j] host] set R_port($j) [srv [expr 0-$j] port] + set R_unixsocket($j) [srv [expr 0-$j] unixsocket] if {$debug_msg} {puts "Log file: [srv [expr 0-$j] stdout]"} } @@ -36,7 +37,7 @@ start_server {} { } set cycle_start_time [clock milliseconds] - set bench_pid [exec src/redis-benchmark -p $R_port(0) -n 10000000 -r 1000 incr __rand_int__ > /dev/null &] + set bench_pid [exec src/redis-benchmark -s $R_unixsocket(0) -n 10000000 -r 1000 incr __rand_int__ > /dev/null &] while 1 { set elapsed [expr {[clock milliseconds]-$cycle_start_time}] if {$elapsed > $duration*1000} break diff --git a/tests/integration/redis-cli.tcl b/tests/integration/redis-cli.tcl index 40e4222e3..5d1635950 100644 --- a/tests/integration/redis-cli.tcl +++ b/tests/integration/redis-cli.tcl @@ -1,7 +1,10 @@ +source tests/support/cli.tcl + start_server {tags {"cli"}} { proc open_cli {} { set ::env(TERM) dumb - set fd [open [format "|src/redis-cli -p %d -n 9" [srv port]] "r+"] + set cmdline [rediscli [srv port] "-n 9"] + set fd [open "|$cmdline" "r+"] fconfigure $fd -buffering none fconfigure $fd -blocking false fconfigure $fd -translation binary @@ -54,8 +57,8 @@ start_server {tags {"cli"}} { } proc _run_cli {opts args} { - set cmd [format "src/redis-cli -p %d -n 9 $args" [srv port]] - foreach {key value} $opts { + set cmd [rediscli [srv port] [list -n 9 {*}$args]] + foreach {key value} $opts { if {$key eq "pipe"} { set cmd "sh -c \"$value | $cmd\"" } diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index 1c18582c5..4bd1f47f7 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -466,3 +466,167 @@ test {diskless loading short read} { } } +# get current stime and utime metrics for a thread (since its creation) +proc get_cpu_metrics { statfile } { + if { [ catch { +
set fid [ open $statfile r ] + set data [ read $fid 1024 ] + ::close $fid + set data [ split $data ] + + ;## number of jiffies it has been scheduled... + set utime [ lindex $data 13 ] + set stime [ lindex $data 14 ] + } err ] } { + error "assertion:can't parse /proc: $err" + } + set mstime [clock milliseconds] + return [ list $mstime $utime $stime ] +} + +# compute %utime and %stime of a thread between two measurements +proc compute_cpu_usage {start end} { + set clock_ticks [exec getconf CLK_TCK] + # convert ms time to jiffies and calc delta + set dtime [ expr { ([lindex $end 0] - [lindex $start 0]) * double($clock_ticks) / 1000 } ] + set utime [ expr { [lindex $end 1] - [lindex $start 1] } ] + set stime [ expr { [lindex $end 2] - [lindex $start 2] } ] + set pucpu [ expr { ($utime / $dtime) * 100 } ] + set pscpu [ expr { ($stime / $dtime) * 100 } ] + return [ list $pucpu $pscpu ] +} + + +# test diskless rdb pipe with multiple replicas, which may drop half way +start_server {tags {"repl"}} { + set master [srv 0 client] + $master config set repl-diskless-sync yes + $master config set repl-diskless-sync-delay 1 + set master_host [srv 0 host] + set master_port [srv 0 port] + set master_pid [srv 0 pid] + # put enough data in the db that the rdb file will be bigger than the socket buffers + # and since we'll have key-load-delay of 100, 10000 keys will take at least 1 second + # we also need the replica to process requests during transfer (which it does only once in 2mb) + $master debug populate 10000 test 10000 + $master config set rdbcompression no + # If running on Linux, we also measure utime/stime to detect possible I/O handling issues + if {[catch {exec uname} os]} {set os ""} + set measure_time [expr {$os == "Linux"} ?
1 : 0] + foreach all_drop {no slow fast all} { + test "diskless $all_drop replicas drop during rdb pipe" { + set replicas {} + set replicas_alive {} + # start one replica that will read the rdb fast, and one that will be slow + start_server {} { + lappend replicas [srv 0 client] + lappend replicas_alive [srv 0 client] + start_server {} { + lappend replicas [srv 0 client] + lappend replicas_alive [srv 0 client] + + # start replication + # it's enough for just one replica to be slow, and have it's write handler enabled + # so that the whole rdb generation process is bound to that + [lindex $replicas 0] config set repl-diskless-load swapdb + [lindex $replicas 0] config set key-load-delay 100 + [lindex $replicas 0] replicaof $master_host $master_port + [lindex $replicas 1] replicaof $master_host $master_port + + # wait for the replicas to start reading the rdb + # using the log file since the replica only responds to INFO once in 2mb + wait_for_log_message -1 "*Loading DB in memory*" 8 800 10 + + if {$measure_time} { + set master_statfile "/proc/$master_pid/stat" + set master_start_metrics [get_cpu_metrics $master_statfile] + set start_time [clock seconds] + } + + # wait a while so that the pipe socket writer will be + # blocked on write (since replica 0 is slow to read from the socket) + after 500 + + # add some command to be present in the command stream after the rdb. 
+ $master incr $all_drop + + # disconnect replicas depending on the current test + if {$all_drop == "all" || $all_drop == "fast"} { + exec kill [srv 0 pid] + set replicas_alive [lreplace $replicas_alive 1 1] + } + if {$all_drop == "all" || $all_drop == "slow"} { + exec kill [srv -1 pid] + set replicas_alive [lreplace $replicas_alive 0 0] + } + + # wait for rdb child to exit + wait_for_condition 500 100 { + [s -2 rdb_bgsave_in_progress] == 0 + } else { + fail "rdb child didn't terminate" + } + + # make sure we got what we were aiming for, by looking for the message in the log file + if {$all_drop == "all"} { + wait_for_log_message -2 "*Diskless rdb transfer, last replica dropped, killing fork child*" 12 1 1 + } + if {$all_drop == "no"} { + wait_for_log_message -2 "*Diskless rdb transfer, done reading from pipe, 2 replicas still up*" 12 1 1 + } + if {$all_drop == "slow" || $all_drop == "fast"} { + wait_for_log_message -2 "*Diskless rdb transfer, done reading from pipe, 1 replicas still up*" 12 1 1 + } + + # make sure we don't have a busy loop going through epoll_wait + if {$measure_time} { + set master_end_metrics [get_cpu_metrics $master_statfile] + set time_elapsed [expr {[clock seconds]-$start_time}] + set master_cpu [compute_cpu_usage $master_start_metrics $master_end_metrics] + set master_utime [lindex $master_cpu 0] + set master_stime [lindex $master_cpu 1] + if {$::verbose} { + puts "elapsed: $time_elapsed" + puts "master utime: $master_utime" + puts "master stime: $master_stime" + } + if {$all_drop == "all" || $all_drop == "slow"} { + assert {$master_utime < 70} + assert {$master_stime < 70} + } + if {$all_drop == "no" || $all_drop == "fast"} { + assert {$master_utime < 15} + assert {$master_stime < 15} + } + } + + # verify the data integrity + foreach replica $replicas_alive { + # Wait that replicas acknowledge they are online so + # we are sure that DBSIZE and DEBUG DIGEST will not + # fail because of timing issues.
+ wait_for_condition 50 100 { + [lindex [$replica role] 3] eq {connected} + } else { + fail "replicas still not connected after some time" + } + + # Make sure that replicas and master have same + # number of keys + wait_for_condition 50 100 { + [$master dbsize] == [$replica dbsize] + } else { + fail "Different number of keys between master and replicas after too long time." + } + + # Check digests + set digest [$master debug digest] + set digest0 [$replica debug digest] + assert {$digest ne 0000000000000000000000000000000000000000} + assert {$digest eq $digest0} + } + } + } + } + } +} diff --git a/tests/sentinel/run.tcl b/tests/sentinel/run.tcl index 9a2fcfb49..996af906a 100644 --- a/tests/sentinel/run.tcl +++ b/tests/sentinel/run.tcl @@ -6,6 +6,7 @@ cd tests/sentinel source ../instances.tcl set ::instances_count 5 ; # How many instances we use at max. +set ::tlsdir "../../tls" proc main {} { parse_options diff --git a/tests/sentinel/tests/07-down-conditions.tcl b/tests/sentinel/tests/07-down-conditions.tcl index fb2993b6f..a12ea3151 100644 --- a/tests/sentinel/tests/07-down-conditions.tcl +++ b/tests/sentinel/tests/07-down-conditions.tcl @@ -1,6 +1,7 @@ # Test conditions where an instance is considered to be down source "../tests/includes/init-tests.tcl" +source "../../../tests/support/cli.tcl" proc ensure_master_up {} { wait_for_condition 1000 50 { @@ -28,7 +29,7 @@ test "Crash the majority of Sentinels to prevent failovers for this unit" { test "SDOWN is triggered by non-responding but not crashed instance" { lassign [S 4 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] host port ensure_master_up - exec ../../../src/redis-cli -h $host -p $port debug sleep 10 > /dev/null & + exec ../../../src/redis-cli -h $host -p $port {*}[rediscli_tls_config "../../../tests"] debug sleep 10 > /dev/null & ensure_master_down ensure_master_up } diff --git a/tests/support/cli.tcl b/tests/support/cli.tcl new file mode 100644 index 000000000..37c902a50 --- /dev/null +++ 
b/tests/support/cli.tcl @@ -0,0 +1,19 @@ +proc rediscli_tls_config {testsdir} { + set tlsdir [file join $testsdir tls] + set cert [file join $tlsdir redis.crt] + set key [file join $tlsdir redis.key] + set cacert [file join $tlsdir ca.crt] + + if {$::tls} { + return [list --tls --cert $cert --key $key --cacert $cacert] + } else { + return {} + } +} + +proc rediscli {port {opts {}}} { + set cmd [list src/redis-cli -p $port] + lappend cmd {*}[rediscli_tls_config "tests"] + lappend cmd {*}$opts + return $cmd +} diff --git a/tests/support/cluster.tcl b/tests/support/cluster.tcl index 1576053b4..74587e1f7 100644 --- a/tests/support/cluster.tcl +++ b/tests/support/cluster.tcl @@ -62,7 +62,7 @@ proc ::redis_cluster::__method__refresh_nodes_map {id} { lassign [split $ip_port :] start_host start_port if {[catch { set r {} - set r [redis $start_host $start_port] + set r [redis $start_host $start_port 0 $::tls] set nodes_descr [$r cluster nodes] $r close } e]} { @@ -107,7 +107,7 @@ proc ::redis_cluster::__method__refresh_nodes_map {id} { # Connect to the node set link {} - catch {set link [redis $host $port]} + catch {set link [redis $host $port 0 $::tls]} # Build this node description as an hash. 
set node [dict create \
diff --git a/tests/support/redis.tcl b/tests/support/redis.tcl
index cd8ae3a34..a90ac7f29 100644
--- a/tests/support/redis.tcl
+++ b/tests/support/redis.tcl
@@ -39,8 +39,18 @@ array set ::redis::callback {}
 array set ::redis::state {} ;# State in non-blocking reply reading
 array set ::redis::statestack {} ;# Stack of states, for nested mbulks
 
-proc redis {{server 127.0.0.1} {port 6379} {defer 0}} {
-    set fd [socket $server $port]
+proc redis {{server 127.0.0.1} {port 6379} {defer 0} {tls 0} {tlsoptions {}}} {
+    if {$tls} {
+        package require tls
+        ::tls::init \
+            -cafile "$::tlsdir/ca.crt" \
+            -certfile "$::tlsdir/redis.crt" \
+            -keyfile "$::tlsdir/redis.key" \
+            {*}$tlsoptions
+        set fd [::tls::socket $server $port]
+    } else {
+        set fd [socket $server $port]
+    }
     fconfigure $fd -translation binary
     set id [incr ::redis::id]
     set ::redis::fd($id) $fd
@@ -48,6 +58,7 @@ proc redis {{server 127.0.0.1} {port 6379} {defer 0}} {
     set ::redis::blocking($id) 1
     set ::redis::deferred($id) $defer
     set ::redis::reconnect($id) 0
+    set ::redis::tls($id) $tls ;# Kept per connection: a reconnect must reuse this link's own mode
     ::redis::redis_reset_state $id
     interp alias {} ::redis::redisHandle$id {} ::redis::__dispatch__ $id
 }
@@ -72,7 +83,11 @@ proc ::redis::__dispatch__raw__ {id method argv} {
     # Reconnect the link if needed.
     if {$fd eq {}} {
         lassign $::redis::addr($id) host port
-        set ::redis::fd($id) [socket $host $port]
+        if {$::redis::tls($id)} {
+            set ::redis::fd($id) [::tls::socket $host $port]
+        } else {
+            set ::redis::fd($id) [socket $host $port]
+        }
         fconfigure $::redis::fd($id) -translation binary
         set fd $::redis::fd($id)
     }
diff --git a/tests/support/server.tcl b/tests/support/server.tcl
index 0edb25d8a..b20f1ad36 100644
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@@ -92,7 +92,11 @@ proc is_alive config {
 proc ping_server {host port} {
     set retval 0
     if {[catch {
-        set fd [socket $host $port]
+        if {$::tls} {
+            set fd [::tls::socket $host $port]
+        } else {
+            set fd [socket $host $port]
+        }
         fconfigure $fd -translation binary
         puts $fd "PING\r\n"
         flush $fd
@@ -136,7 +140,6 @@ proc tags {tags code} {
     uplevel 1 $code
     set ::tags [lrange $::tags 0 end-[llength $tags]]
 }
-
 proc start_server {options {code undefined}} {
     # If we are running against an external server, we just push the
     # host/port pair in the stack the first time
@@ -145,7 +148,7 @@ proc start_server {options {code undefined}} {
             set srv {}
             dict set srv "host" $::host
             dict set srv "port" $::port
-            set client [redis $::host $::port]
+            set client [redis $::host $::port 0 $::tls]
             dict set srv "client" $client
             $client select 9
 
@@ -178,6 +181,13 @@ proc start_server {options {code undefined}} {
 
     set data [split [exec cat "tests/assets/$baseconfig"] "\n"]
     set config {}
+    if {$::tls} {
+        dict set config "tls-cert-file" [format "%s/tests/tls/redis.crt" [pwd]]
+        dict set config "tls-key-file" [format "%s/tests/tls/redis.key" [pwd]]
+        dict set config "tls-dh-params-file" [format "%s/tests/tls/redis.dh" [pwd]]
+        dict set config "tls-ca-cert-file" [format "%s/tests/tls/ca.crt" [pwd]]
+        dict set config "loglevel" "debug" ;# NOTE(review): forces debug logging in TLS mode - confirm intended
+    }
     foreach line $data {
         if {[string length $line] > 0 && [string index $line 0] ne "#"} {
             set elements [split $line " "]
@@ -192,7 +202,17 @@ proc start_server {options {code undefined}} {
 
     # start every
server on a different port
     set ::port [find_available_port [expr {$::port+1}]]
-    dict set config port $::port
+    if {$::tls} {
+        dict set config "port" 0 ;# Port 0 turns the plain TCP listener off
+        dict set config "tls-port" $::port
+        dict set config "tls-cluster" "yes"
+        dict set config "tls-replication" "yes"
+    } else {
+        dict set config port $::port
+    }
+
+    set unixsocket [file normalize [format "%s/%s" [dict get $config "dir"] "socket"]]
+    dict set config "unixsocket" $unixsocket
 
     # apply overrides from global space and arguments
     foreach {directive arguments} [concat $::global_overrides $overrides] {
@@ -254,10 +274,11 @@ proc start_server {options {code undefined}} {
     }
 
     # setup properties to be able to initialize a client object
+    set port_param [expr {$::tls ? "tls-port" : "port"}] ;# Config key holding the port clients connect to
     set host $::host
     set port $::port
     if {[dict exists $config bind]} { set host [dict get $config bind] }
-    if {[dict exists $config port]} { set port [dict get $config port] }
+    if {[dict exists $config $port_param]} { set port [dict get $config $port_param] }
 
     # setup config dict
     dict set srv "config_file" $config_file
@@ -267,6 +288,7 @@ proc start_server {options {code undefined}} {
     dict set srv "port" $port
     dict set srv "stdout" $stdout
     dict set srv "stderr" $stderr
+    dict set srv "unixsocket" $unixsocket
 
     # if a block of code is supplied, we wait for the server to become
     # available, create a client object and kill the server afterwards
diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index c2e76afad..7ecf5b79c 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -395,7 +395,7 @@ proc colorstr {color str} {
 # of seconds to the specified Redis instance.
 proc start_write_load {host port seconds} {
     set tclsh [info nameofexecutable]
-    exec $tclsh tests/helpers/gen_write_load.tcl $host $port $seconds &
+    exec $tclsh tests/helpers/gen_write_load.tcl $host $port $seconds $::tls &
 }
 
 # Stop a process generating write load executed with start_write_load.
@@ -423,7 +423,7 @@ proc lshuffle {list} {
 # of ops to the specified Redis instance.
 proc start_bg_complex_data {host port db ops} {
     set tclsh [info nameofexecutable]
-    exec $tclsh tests/helpers/bg_complex_data.tcl $host $port $db $ops &
+    exec $tclsh tests/helpers/bg_complex_data.tcl $host $port $db $ops $::tls &
 }
 
 # Stop a process generating write load executed with start_bg_complex_data.
diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index 1442067f5..cb7e4e328 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -63,6 +63,7 @@ set ::all_tests {
     unit/lazyfree
     unit/wait
     unit/pendingquerybuf
+    unit/tls
 }
 # Index to the next test to run in the ::all_tests list.
 set ::next_test 0
@@ -71,6 +72,7 @@ set ::host 127.0.0.1
 set ::port 21111
 set ::traceleaks 0
 set ::valgrind 0
+set ::tls 0 ;# Set to 1 by the --tls option; passed to every redis client the helpers create
 set ::stack_logging 0
 set ::verbose 0
 set ::quiet 0
@@ -92,6 +94,7 @@ set ::dont_clean 0
 set ::wait_server 0
 set ::stop_on_failure 0
 set ::loop 0
+set ::tlsdir "tests/tls" ;# Directory where ca.crt, redis.crt and redis.key are looked up
 
 # Set to 1 when we are running in client mode. The Redis test uses a
 # server-client model to run tests simultaneously. The server instance
@@ -146,7 +149,7 @@ proc reconnect {args} {
     set host [dict get $srv "host"]
     set port [dict get $srv "port"]
     set config [dict get $srv "config"]
-    set client [redis $host $port]
+    set client [redis $host $port 0 $::tls]
     dict set srv "client" $client
 
     # select the right db when we don't have to authenticate
@@ -166,7 +169,7 @@ proc redis_deferring_client {args} {
     }
 
     # create client that defers reading reply
-    set client [redis [srv $level "host"] [srv $level "port"] 1]
+    set client [redis [srv $level "host"] [srv $level "port"] 1 $::tls]
 
     # select the right db and read the response (OK)
     $client select 9
@@ -204,7 +207,7 @@ proc test_server_main {} {
     if {!$::quiet} {
         puts "Starting test server at port $port"
     }
-    socket -server accept_test_clients -myaddr 127.0.0.1 $port
+    socket -server accept_test_clients -myaddr 127.0.0.1 $port
 
     # Start the client instances
     set ::clients_pids {}
@@ -450,6 +453,7 @@ proc print_help_screen {} {
         "--stop Blocks once the first test fails."
         "--loop Execute the specified set of tests forever."
         "--wait-server Wait after server is started (so that you can attach a debugger)."
+        "--tls Run tests in TLS mode."
         "--help Print this help screen."
     } "\n"]
 }
@@ -486,6 +490,13 @@ for {set j 0} {$j < [llength $argv]} {incr j} {
         }
     } elseif {$opt eq {--quiet}} {
         set ::quiet 1
+    } elseif {$opt eq {--tls}} {
+        package require tls 1.6 ;# Provides the ::tls:: commands used throughout the suite
+        set ::tls 1 ;# From here on helpers open TLS connections instead of plain TCP
+        ::tls::init \
+            -cafile "$::tlsdir/ca.crt" \
+            -certfile "$::tlsdir/redis.crt" \
+            -keyfile "$::tlsdir/redis.key"
     } elseif {$opt eq {--host}} {
         set ::external 1
         set ::host $arg
@@ -565,7 +576,11 @@ if {[llength $::single_tests] > 0} {
 }
 
 proc attach_to_replication_stream {} {
-    set s [socket [srv 0 "host"] [srv 0 "port"]]
+    if {$::tls} {
+        set s [::tls::socket [srv 0 "host"] [srv 0 "port"]]
+    } else {
+        set s [socket [srv 0 "host"] [srv 0 "port"]]
+    }
     fconfigure $s -translation binary
     puts -nonewline $s "SYNC\r\n"
     flush $s
diff --git a/tests/unit/limits.tcl b/tests/unit/limits.tcl
index b37ea9b0f..38ba76208 100644
--- a/tests/unit/limits.tcl
+++ b/tests/unit/limits.tcl
@@ -1,4 +1,9 @@
 start_server {tags {"limits"} overrides {maxclients 10}} {
+    if {$::tls} {
+        set expected_code "*I/O error*" ;# Pattern the refused connection must match in TLS mode
+    } else {
+        set expected_code "*ERR max*reached*"
+    }
     test {Check if maxclients works refusing connections} {
         set c 0
         catch {
@@ -12,5 +17,5 @@ start_server {tags {"limits"} overrides {maxclients 10}} {
         } e
         assert {$c > 8 && $c <= 10}
         set e
-    } {*ERR max*reached*}
+    } $expected_code
 }
diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl
index 965902456..7720c055a 100644
--- a/tests/unit/other.tcl
+++ b/tests/unit/other.tcl
@@ -166,7 +166,11 @@ start_server {tags {"other"}} {
 
     tags {protocol} {
         test {PIPELINING stresser (also a regression for the old epoll bug)} {
-            set fd2 [socket $::host $::port]
+            if {$::tls} {
+                set fd2 [::tls::socket $::host $::port]
+            } else {
+                set fd2 [socket $::host $::port]
+            }
             fconfigure $fd2 -encoding binary -translation binary
             puts -nonewline $fd2 "SELECT 9\r\n"
             flush $fd2
diff --git a/tests/unit/protocol.tcl b/tests/unit/protocol.tcl
index ac99c3abb..4dfdc6f59 100644
--- a/tests/unit/protocol.tcl
+++ b/tests/unit/protocol.tcl
@@ -72,7 +72,11 @@
start_server {tags {"protocol"}} {
     foreach seq [list "\x00" "*\x00" "$\x00"] {
         incr c
         test "Protocol desync regression test #$c" {
-            set s [socket [srv 0 host] [srv 0 port]]
+            if {$::tls} {
+                set s [::tls::socket [srv 0 host] [srv 0 port]]
+            } else {
+                set s [socket [srv 0 host] [srv 0 port]]
+            }
             puts -nonewline $s $seq
             set payload [string repeat A 1024]"\n"
             set test_start [clock seconds]
diff --git a/tests/unit/tls.tcl b/tests/unit/tls.tcl
new file mode 100644
index 000000000..950f65557
--- /dev/null
+++ b/tests/unit/tls.tcl
@@ -0,0 +1,108 @@
+start_server {tags {"tls"}} {
+    if {$::tls} {
+        package require tls
+
+        test {TLS: Not accepting non-TLS connections on a TLS port} {
+            # No tls argument is passed, so this is a plain TCP client.
+            set s [redis [srv 0 host] [srv 0 port]]
+            catch {$s PING} e
+            set e
+        } {*I/O error*}
+
+        test {TLS: Verify tls-auth-clients behaves as expected} {
+            set s [redis [srv 0 host] [srv 0 port]]
+            ::tls::import [$s channel]
+            catch {$s PING} e
+            assert_match {*error*} $e
+
+            r CONFIG SET tls-auth-clients no
+
+            set s [redis [srv 0 host] [srv 0 port]]
+            ::tls::import [$s channel]
+            catch {$s PING} e
+            assert_match {PONG} $e
+
+            r CONFIG SET tls-auth-clients yes
+        }
+
+        test {TLS: Verify tls-protocols behaves as expected} {
+            r CONFIG SET tls-protocols TLSv1
+
+            set s [redis [srv 0 host] [srv 0 port] 0 1 {-tls1 0}]
+            catch {$s PING} e
+            assert_match {*I/O error*} $e
+
+            set s [redis [srv 0 host] [srv 0 port] 0 1 {-tls1 1}]
+            catch {$s PING} e
+            assert_match {PONG} $e
+
+            r CONFIG SET tls-protocols TLSv1.1
+
+            set s [redis [srv 0 host] [srv 0 port] 0 1 {-tls1.1 0}]
+            catch {$s PING} e
+            assert_match {*I/O error*} $e
+
+            set s [redis [srv 0 host] [srv 0 port] 0 1 {-tls1.1 1}]
+            catch {$s PING} e
+            assert_match {PONG} $e
+
+            r CONFIG SET tls-protocols TLSv1.2
+
+            set s [redis [srv 0 host] [srv 0 port] 0 1 {-tls1.2 0}]
+            catch {$s PING} e
+            assert_match {*I/O error*} $e
+
+            set s [redis [srv 0 host] [srv 0 port] 0 1 {-tls1.2 1}]
+            catch {$s PING} e
+            assert_match {PONG} $e
+
+            r CONFIG SET tls-protocols ""
+        }
+
+        test {TLS: Verify tls-ciphers behaves as expected} {
+            r CONFIG SET tls-protocols TLSv1.2
+            r CONFIG SET tls-ciphers "DEFAULT:-AES128-SHA256"
+
+            set s [redis [srv 0 host] [srv 0 port] 0 1 {-cipher "-ALL:AES128-SHA256"}]
+            catch {$s PING} e
+            assert_match {*I/O error*} $e
+
+            set s [redis [srv 0 host] [srv 0 port] 0 1 {-cipher "-ALL:AES256-SHA256"}]
+            catch {$s PING} e
+            assert_match {PONG} $e
+
+            r CONFIG SET tls-ciphers "DEFAULT"
+
+            set s [redis [srv 0 host] [srv 0 port] 0 1 {-cipher "-ALL:AES128-SHA256"}]
+            catch {$s PING} e
+            assert_match {PONG} $e
+
+            r CONFIG SET tls-protocols ""
+            r CONFIG SET tls-ciphers "DEFAULT"
+        }
+
+        test {TLS: Verify tls-prefer-server-ciphers behaves as expected} {
+            r CONFIG SET tls-protocols TLSv1.2
+            r CONFIG SET tls-ciphers "AES128-SHA256:AES256-SHA256"
+
+            set s [redis [srv 0 host] [srv 0 port] 0 1 {-cipher "AES256-SHA256:AES128-SHA256"}]
+            catch {$s PING} e
+            assert_match {PONG} $e
+
+            assert_equal "AES256-SHA256" [dict get [::tls::status [$s channel]] cipher]
+
+            r CONFIG SET tls-prefer-server-ciphers yes
+
+            set s [redis [srv 0 host] [srv 0 port] 0 1 {-cipher "AES256-SHA256:AES128-SHA256"}]
+            catch {$s PING} e
+            assert_match {PONG} $e
+
+            assert_equal "AES128-SHA256" [dict get [::tls::status [$s channel]] cipher]
+
+            # Put every setting this test changed back, like the tests above do.
+            r CONFIG SET tls-prefer-server-ciphers no
+            r CONFIG SET tls-protocols ""
+            r CONFIG SET tls-ciphers "DEFAULT"
+        }
+    }
+}
diff --git a/tests/unit/wait.tcl b/tests/unit/wait.tcl
index e2f5d2942..c9cfa6ed4 100644
--- a/tests/unit/wait.tcl
+++ b/tests/unit/wait.tcl
@@ -1,3 +1,5 @@
+source tests/support/cli.tcl
+
 start_server {tags {"wait"}} {
 start_server {} {
     set slave [srv 0 client]
@@ -31,7 +33,8 @@ start_server {} {
     }
 
     test {WAIT should not acknowledge 1 additional copy if slave is blocked} {
-        exec src/redis-cli -h $slave_host -p $slave_port debug sleep 5 > /dev/null 2> /dev/null &
+        set cmd [rediscli $slave_port "-h $slave_host debug sleep 5"]
+        exec {*}$cmd > /dev/null 2> /dev/null &
         after 1000 ;# Give redis-cli the time to
execute the command.
         $master set foo 0
         $master incr foo
diff --git a/utils/gen-test-certs.sh b/utils/gen-test-certs.sh
new file mode 100755
index 000000000..a46edc55a
--- /dev/null
+++ b/utils/gen-test-certs.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Generate the root CA, server certificate/key and DH parameters used when
+# running the tests in TLS mode. Paths are relative: run from the source root.
+
+# Stop at the first failing step (including inside the req | x509 pipe)
+# instead of carrying on and leaving a partial set of files in tests/tls.
+set -euo pipefail
+
+mkdir -p tests/tls
+openssl genrsa -out tests/tls/ca.key 4096
+openssl req \
+    -x509 -new -nodes -sha256 \
+    -key tests/tls/ca.key \
+    -days 3650 \
+    -subj '/O=Redis Test/CN=Certificate Authority' \
+    -out tests/tls/ca.crt
+openssl genrsa -out tests/tls/redis.key 2048
+openssl req \
+    -new -sha256 \
+    -key tests/tls/redis.key \
+    -subj '/O=Redis Test/CN=Server' | \
+    openssl x509 \
+        -req -sha256 \
+        -CA tests/tls/ca.crt \
+        -CAkey tests/tls/ca.key \
+        -CAserial tests/tls/ca.txt \
+        -CAcreateserial \
+        -days 365 \
+        -out tests/tls/redis.crt
+openssl dhparam -out tests/tls/redis.dh 2048