From dbcc0a85d070f94f5c524a4bbdd3e492c32e845b Mon Sep 17 00:00:00 2001 From: Yossi Gottlieb Date: Mon, 8 Feb 2021 17:02:46 +0200 Subject: [PATCH] Fix and cleanup Sentinel leaked fds test. (#8469) * For consistency, use tclsh for the script as well * Ignore leaked fds that originate from grandparent process, since we only care about fds redis-sentinel itself is responsible for * Check every test iteration to catch problems early * Some cleanups, e.g. parameterization of file name, etc. --- tests/instances.tcl | 27 +++---- tests/sentinel/run.tcl | 3 + .../tests/helpers/check_leaked_fds.tcl | 70 +++++++++++++++++++ tests/sentinel/tests/includes/init-tests.tcl | 6 +- tests/sentinel/tests/includes/notify.sh | 21 ------ 5 files changed, 91 insertions(+), 36 deletions(-) create mode 100755 tests/sentinel/tests/helpers/check_leaked_fds.tcl delete mode 100755 tests/sentinel/tests/includes/notify.sh diff --git a/tests/instances.tcl b/tests/instances.tcl index 8cb616ae8..793bce80d 100644 --- a/tests/instances.tcl +++ b/tests/instances.tcl @@ -29,6 +29,7 @@ set ::sentinel_base_port 20000 set ::redis_base_port 30000 set ::redis_port_count 1024 set ::host "127.0.0.1" +set ::leaked_fds_file [file normalize "tmp/leaked_fds.txt"] set ::pids {} ; # We kill everything at exit set ::dirs {} ; # We remove all the temp dirs at exit set ::run_matching {} ; # If non empty, only tests matching pattern are run. @@ -410,13 +411,13 @@ proc check_leaks instance_types { # Execute all the units inside the 'tests' directory. proc run_tests {} { - set sentinel_fd_leaks_file "sentinel_fd_leaks" - if { [file exists $sentinel_fd_leaks_file] } { - file delete $sentinel_fd_leaks_file - } - set tests [lsort [glob ../tests/*]] foreach test $tests { + # Remove leaked_fds file before starting + if {$::leaked_fds_file != "" && [file exists $::leaked_fds_file]} { + file delete $::leaked_fds_file + } + if {$::run_matching ne {} && [string match $::run_matching $test] == 0} { continue } @@ -424,19 +425,19 @@ proc run_tests {} { puts [colorstr yellow "Testing unit: [lindex [file split $test] end]"] source $test check_leaks {redis sentinel} + + # Check if a leaked fds file was created and abort the test. + if {$::leaked_fds_file != "" && [file exists $::leaked_fds_file]} { + puts [colorstr red "ERROR: Sentinel has leaked fds to scripts:"] + puts [exec cat $::leaked_fds_file] + puts "----" + incr ::failed + } } } # Print a message and exists with 0 / 1 according to zero or more failures. proc end_tests {} { - set sentinel_fd_leaks_file "sentinel_fd_leaks" - if { [file exists $sentinel_fd_leaks_file] } { - # temporarily disabling this error from failing the tests until leaks are fixed. - #puts [colorstr red "WARNING: sentinel test(s) failed, there are leaked fds in sentinel:"] - #puts [exec cat $sentinel_fd_leaks_file] - #exit 1 - } - if {$::failed == 0 } { puts "GOOD! No errors." exit 0 diff --git a/tests/sentinel/run.tcl b/tests/sentinel/run.tcl index c275aa762..64c99c103 100644 --- a/tests/sentinel/run.tcl +++ b/tests/sentinel/run.tcl @@ -10,6 +10,9 @@ set ::tlsdir "../../tls" proc main {} { parse_options + if {$::leaked_fds_file != ""} { + set ::env(LEAKED_FDS_FILE) $::leaked_fds_file + } spawn_instance sentinel $::sentinel_base_port $::instances_count [list "sentinel deny-scripts-reconfig no"] "../tests/includes/sentinel.conf" spawn_instance redis $::redis_base_port $::instances_count run_tests diff --git a/tests/sentinel/tests/helpers/check_leaked_fds.tcl b/tests/sentinel/tests/helpers/check_leaked_fds.tcl new file mode 100755 index 000000000..cb7b5d3d9 --- /dev/null +++ b/tests/sentinel/tests/helpers/check_leaked_fds.tcl @@ -0,0 +1,70 @@ +#!/usr/bin/env tclsh +# +# This script detects file descriptors that have leaked from a parent process. +# +# Our goal is to detect file descriptors that were opened by the parent and +# not cleaned up prior to exec(), but not file descriptors that were inherited +# from the grandparent which the parent knows nothing about. To do that, we +# look up every potential leak and try to match it against open files by the +# grandparent process. + +# Get PID of parent process +proc get_parent_pid {_pid} { + set fd [open "/proc/$_pid/status" "r"] + set content [read $fd] + close $fd + + if {[regexp {\nPPid:\s+(\d+)} $content _ ppid]} { + return $ppid + } + + error "failed to get parent pid" +} + +# Linux only +set os [exec uname] +if {$os != "Linux"} { + puts "Only Linux is supported." + exit 0 +} + +if {![info exists env(LEAKED_FDS_FILE)]} { + puts "Missing LEAKED_FDS_FILE environment variable." + exit 0 +} + +set outfile $::env(LEAKED_FDS_FILE) +set parent_pid [get_parent_pid [pid]] +set grandparent_pid [get_parent_pid $parent_pid] +set leaked_fds {} + +# Look for fds that were directly inherited from our parent but not from +# our grandparent (tcl) +foreach fd [glob -tails -directory "/proc/self/fd" *] { + # Ignore stdin/stdout/stderr + if {$fd == 0 || $fd == 1 || $fd == 2} { + continue + } + + # Read symlink to get info about the file descriptor. This can be a real + # file name or an arbitrary string that identifies the fd. + if { [catch {set fdlink [file readlink "/proc/self/fd/$fd"]} err] } { + continue + } + + # See if grandparent process has the same fd and info and skip if it does. + if { ! [catch {set gp_fdlink [file readlink "/proc/$grandparent_pid/fd/$fd"]} err] } { + if {$gp_fdlink == $fdlink} { + continue + } + } + + lappend leaked_fds [list $fd $fdlink] +} + +# Produce report only if we found leaks +if {[llength $leaked_fds] > 0} { + set fd [open $outfile "w"] + puts $fd [join $leaked_fds "\n"] + close $fd +} diff --git a/tests/sentinel/tests/includes/init-tests.tcl b/tests/sentinel/tests/includes/init-tests.tcl index b4626caed..be6a8f502 100644 --- a/tests/sentinel/tests/includes/init-tests.tcl +++ b/tests/sentinel/tests/includes/init-tests.tcl @@ -41,8 +41,10 @@ test "(init) Sentinels can start monitoring a master" { S $id SENTINEL SET mymaster down-after-milliseconds 2000 S $id SENTINEL SET mymaster failover-timeout 20000 S $id SENTINEL SET mymaster parallel-syncs 10 - S $id SENTINEL SET mymaster notification-script ../../tests/includes/notify.sh - S $id SENTINEL SET mymaster client-reconfig-script ../../tests/includes/notify.sh + if {$::leaked_fds_file != ""} { + S $id SENTINEL SET mymaster notification-script ../../tests/helpers/check_leaked_fds.tcl + S $id SENTINEL SET mymaster client-reconfig-script ../../tests/helpers/check_leaked_fds.tcl + } } } diff --git a/tests/sentinel/tests/includes/notify.sh b/tests/sentinel/tests/includes/notify.sh deleted file mode 100755 index 5de0eaf76..000000000 --- a/tests/sentinel/tests/includes/notify.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash - -OS=`uname -s` -if [ ${OS} != "Linux" ] -then - exit 0 -fi - -# fd 3 is meant to catch the actual access to /proc/pid/fd, -# in case there's an fd leak by the sentinel, -# it can take 3, but then the access to /proc will take another fd, and we'll catch that. -leaked_fd_count=`ls /proc/self/fd | grep -vE '^[0|1|2|3]$' | wc -l` -if [ $leaked_fd_count -gt 0 ] -then - sentinel_fd_leaks_file="../sentinel_fd_leaks" - if [ ! -f $sentinel_fd_leaks_file ] - then - ls -l /proc/self/fd | cat >> $sentinel_fd_leaks_file - lsof -p $$ | cat >> $sentinel_fd_leaks_file - fi -fi