#ifndef __CLUSTER_H #define __CLUSTER_H /*----------------------------------------------------------------------------- * Redis cluster data structures, defines, exported API. *----------------------------------------------------------------------------*/ #define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ #define CLUSTER_SLOTS (1<flags & CLUSTER_NODE_MASTER) #define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE) #define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) #define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) #define nodeWithoutAddr(n) ((n)->flags & CLUSTER_NODE_NOADDR) #define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) #define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) #define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) /* Message types. * * Note that the PING, PONG and MEET messages are actually the same exact * kind of packet. PONG is the reply to ping, in the exact format as a PING, * while MEET is a special PING that forces the receiver to add the sender * as a node (if it is not already in the list). */ #define CLUSTERMSG_TYPE_PING 0 /* Ping */ #define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ #define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ #define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ #define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ #define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ #define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */ #define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */ #define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ #define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ #define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ #define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. */ /* Flags that a module can set in order to prevent certain Redis Cluster * features to be enabled. Useful when implementing a different distributed * system on top of Redis Cluster message bus, using modules. */ #define CLUSTER_MODULE_FLAG_NONE 0 #define CLUSTER_MODULE_FLAG_NO_FAILOVER (1<<1) #define CLUSTER_MODULE_FLAG_NO_REDIRECTION (1<<2) /* This structure represent elements of node->fail_reports. */ typedef struct clusterNodeFailReport { struct clusterNode *node; /* Node reporting the failure condition. */ mstime_t time; /* Time of the last report from this node. */ } clusterNodeFailReport; typedef struct clusterNode { mstime_t ctime; /* Node object creation time. */ char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ char shard_id[CLUSTER_NAMELEN]; /* shard id, hex string, sha1-size */ int flags; /* CLUSTER_NODE_... */ uint64_t configEpoch; /* Last configEpoch observed for this node */ unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */ uint16_t *slot_info_pairs; /* Slots info represented as (start/end) pair (consecutive index). */ int slot_info_pairs_count; /* Used number of slots in slot_info_pairs */ int numslots; /* Number of slots handled by this node */ int numslaves; /* Number of slave nodes, if this is a master */ struct clusterNode **slaves; /* pointers to slave nodes */ struct clusterNode *slaveof; /* pointer to the master node. Note that it may be NULL even if the node is a slave if we don't have the master node in our tables. */ unsigned long long last_in_ping_gossip; /* The number of the last carried in the ping gossip section */ mstime_t ping_sent; /* Unix time we sent latest ping */ mstime_t pong_received; /* Unix time we received the pong */ mstime_t data_received; /* Unix time we received any data */ mstime_t fail_time; /* Unix time when FAIL flag was set */ mstime_t voted_time; /* Last time we voted for a slave of this master */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned master condition */ long long repl_offset; /* Last known repl offset for this node. */ char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ sds hostname; /* The known hostname for this node */ sds human_nodename; /* The known human readable nodename for this node */ int tcp_port; /* Latest known clients TCP port. */ int tls_port; /* Latest known clients TLS port */ int cport; /* Latest known cluster port of this node. */ clusterLink *link; /* TCP/IP link established toward this node */ clusterLink *inbound_link; /* TCP/IP link accepted from this node */ list *fail_reports; /* List of nodes signaling this as failing */ } clusterNode; typedef struct clusterState { clusterNode *myself; /* This node */ uint64_t currentEpoch; int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ int size; /* Num of master nodes with at least one slot */ dict *nodes; /* Hash table of name -> clusterNode structures */ dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */ clusterNode *migrating_slots_to[CLUSTER_SLOTS]; clusterNode *importing_slots_from[CLUSTER_SLOTS]; clusterNode *slots[CLUSTER_SLOTS]; rax *slots_to_channels; /* The following fields are used to take the slave state on elections. */ mstime_t failover_auth_time; /* Time of previous or next election. */ int failover_auth_count; /* Number of votes received so far. */ int failover_auth_sent; /* True if we already asked for votes. */ int failover_auth_rank; /* This slave rank for current auth request. */ uint64_t failover_auth_epoch; /* Epoch of the current election. */ int cant_failover_reason; /* Why a slave is currently not able to failover. See the CANT_FAILOVER_* macros. */ /* Manual failover state in common. */ mstime_t mf_end; /* Manual failover time limit (ms unixtime). It is zero if there is no MF in progress. */ /* Manual failover state of master. */ clusterNode *mf_slave; /* Slave performing the manual failover. */ /* Manual failover state of slave. */ long long mf_master_offset; /* Master offset the slave needs to start MF or -1 if still not received. */ int mf_can_start; /* If non-zero signal that the manual failover can start requesting masters vote. */ /* The following fields are used by masters to take state on elections. */ uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ /* Stats */ /* Messages received and sent by type. */ long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT]; long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT]; long long stats_pfail_nodes; /* Number of nodes in PFAIL status, excluding nodes without address. */ unsigned long long stat_cluster_links_buffer_limit_exceeded; /* Total number of cluster links freed due to exceeding buffer limit */ /* Bit map for slots that are no longer claimed by the owner in cluster PING * messages. During slot migration, the owner will stop claiming the slot after * the ownership transfer. Set the bit corresponding to the slot when a node * stops claiming the slot. This prevents spreading incorrect information (that * source still owns the slot) using UPDATE messages. */ unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; } clusterState; /* ---------------------- API exported outside cluster.c -------------------- */ void clusterInit(void); void clusterInitListeners(void); void clusterCron(void); void clusterBeforeSleep(void); clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); int verifyClusterNodeId(const char *name, int length); clusterNode *clusterLookupNode(const char *name, int length); int clusterRedirectBlockedClientIfNeeded(client *c); void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code); void migrateCloseTimedoutSockets(void); int verifyClusterConfigWithData(void); unsigned long getClusterConnectionsCount(void); int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len); void clusterPropagatePublish(robj *channel, robj *message, int sharded); unsigned int keyHashSlot(char *key, int keylen); int patternHashSlot(char *pattern, int length); void clusterUpdateMyselfFlags(void); void clusterUpdateMyselfIp(void); void slotToChannelAdd(sds channel); void slotToChannelDel(sds channel); void clusterUpdateMyselfHostname(void); void clusterUpdateMyselfAnnouncedPorts(void); sds clusterGenNodesDescription(client *c, int filter, int tls_primary); sds genClusterInfoString(void); void freeClusterLink(clusterLink *link); int clusterNodeGetSlotBit(clusterNode *n, int slot); void clusterUpdateMyselfHumanNodename(void); int isValidAuxString(char *s, unsigned int length); int getNodeDefaultClientPort(clusterNode *n); #endif /* __CLUSTER_H */