replication_test.py

import os
import re
import time
from distutils.version import LooseVersion

import pytest
import logging

from collections import defaultdict

from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement

from dtest import DtestTimeoutError, Tester, create_ks

# Shorthand for the version-gating marker used on the tests below.
since = pytest.mark.since
logger = logging.getLogger(__name__)

# Patterns matched against the description of query trace events.
# Where a pattern has a capture group, it captures the IPv4 address that
# follows the '/' in the message.
TRACE_DETERMINE_REPLICAS = re.compile('Determining replicas for mutation')
TRACE_SEND_MESSAGE = re.compile(r'Sending (?:MUTATION|MUTATION_REQ|REQUEST_RESPONSE) message to /([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)')
# NOTE(review): the '||' below introduces an empty alternative, so this
# pattern matches ' message received from /<ip>' for ANY message type, not
# only the ones listed. Confirm whether that is intended before tightening it;
# removing it may stop matching response verbs that are not listed here.
TRACE_RESPOND_MESSAGE = re.compile(r'(?:MUTATION||MUTATION_REQ|REQUEST_RESPONSE) message received from /([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)')
TRACE_COMMIT_LOG = re.compile('Appending to commitlog')
TRACE_FORWARD_WRITE = re.compile(r'Enqueuing forwarded write to /([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)')

# Some pre-computed murmur 3 hashes; there are no good python murmur3
# hashing libraries :(
#
# Maps an integer partition key to its token. The tests below insert each key
# as the `id` column and use the token to work out which nodes should hold
# the row. Entries are listed in ascending token order.
murmur3_hashes = {
    5: -7509452495886106294,
    10: -6715243485458697746,
    16: -5477287129830487822,
    13: -5034495173465742853,
    11: -4156302194539278891,
    1: -4069959284402364209,
    19: -3974532302236993209,
    8: -3799847372828181882,
    2: -3248873570005575792,
    4: -2729420104000364805,
    18: -2695747960476065067,
    15: -1191135763843456182,
    20: 1388667306199997068,
    7: 1634052884888577606,
    6: 2705480034054113608,
    9: 3728482343045213994,
    14: 4279681877540623768,
    17: 5467144456125416399,
    12: 8582886034424406875,
    3: 9010454139840013625
}


def query_system_traces_length(session):
    """
    Return how many rows system_traces.events currently holds.
    """
    rows = session.execute("SELECT * FROM system_traces.events")
    return sum(1 for _ in rows)


def last_n_values_same(n, iterable):
    """
    Tell whether the sequence ends with n entries that are all equal.

    Returns False when the slice of the last n entries does not contain
    exactly n items.
    """
    tail = iterable[-n:]
    return len(tail) == n and len(set(tail)) == 1


def block_on_trace(session):
    """
    Block until the number of rows in system_traces.events stops changing.

    The row count is sampled about once a second; the function returns once
    the last five samples are identical. Raises DtestTimeoutError if that has
    not happened roughly 180 seconds after the call started.
    """
    stable_samples_needed = 5

    # Trace events should eventually stop being generated, so the deadline
    # is only a safety net against waiting forever.
    deadline = time.time() + 180

    event_counts = []
    while True:
        if last_n_values_same(stable_samples_needed, event_counts):
            return

        event_counts.append(query_system_traces_length(session))
        time.sleep(1)

        if time.time() > deadline:
            raise DtestTimeoutError()


@pytest.mark.no_vnodes
class TestReplication(Tester):
    """
    This test suite looks at how data is replicated across a cluster
    and who the coordinator, replicas and forwarders involved are.
    """

    def get_replicas_from_trace(self, trace):
        """
        Look at trace and return a summary of the nodes involved in the write.

        The coordinator is the source of the first trace event whose
        description starts with 'Determining replicas for mutation'.

        The returned dict has the keys:

        coordinator: address of the coordinator, or None if no such event exists
        forwarders: addresses named in 'Enqueuing forwarded write to' events
            that were emitted by a node other than the coordinator
        replicas: sources of the 'Appending to commitlog' events
        nodes_sent_write: addresses the coordinator sent a message to, plus
            every address in forwarders
        nodes_responded_write: addresses the coordinator received a message from
        nodes_contacted: source address -> set of addresses it sent a message to
        """
        # Step 1, find coordinator node:
        coordinator = None
        for trace_event in trace.events:
            if trace_event.description.startswith('Determining replicas for mutation'):
                coordinator = trace_event.source
                break

        nodes_sent_write = set()  # Nodes sent a write request
        nodes_responded_write = set()  # Nodes that acknowledged a write
        replicas_written = set()  # Nodes that wrote to their commitlog
        forwarders = set()  # Addresses named in forwarded-write events
        nodes_contacted = defaultdict(set)  # node -> set of nodes that were contacted

        for trace_event in trace.events:
            activity = trace_event.description
            source = trace_event.source

            # Step 2, find all the nodes that each node talked to.
            # A description is treated as a receive only if it is not a send.
            send_match = TRACE_SEND_MESSAGE.search(activity)
            recv_match = TRACE_RESPOND_MESSAGE.search(activity)
            if send_match:
                node_contacted = send_match.group(1)
                if source == coordinator:
                    nodes_sent_write.add(node_contacted)
                nodes_contacted[source].add(node_contacted)
            elif recv_match:
                if source == coordinator:
                    nodes_responded_write.add(recv_match.group(1))

            # Step 3, find nodes that forwarded to other nodes:
            # (Happens in multi-datacenter clusters)
            if source != coordinator:
                forward_match = TRACE_FORWARD_WRITE.search(activity)
                if forward_match:
                    forwarding_node = forward_match.group(1)
                    nodes_sent_write.add(forwarding_node)
                    forwarders.add(forwarding_node)

            # Step 4, find nodes who actually wrote data:
            if TRACE_COMMIT_LOG.search(activity):
                replicas_written.add(source)

        return {"coordinator": coordinator,
                "forwarders": forwarders,
                "replicas": replicas_written,
                "nodes_sent_write": nodes_sent_write,
                "nodes_responded_write": nodes_responded_write,
                "nodes_contacted": nodes_contacted
                }

    def get_replicas_for_token(self, token, replication_factor,
                               strategy='SimpleStrategy', nodes=None):
        """
        Figure out which node(s) should receive data for a given token and
        replication factor.

        token: the token of the partition key
        replication_factor: an int for SimpleStrategy, or a dict of
            datacenter name -> int for NetworkTopologyStrategy
        strategy: 'SimpleStrategy' or 'NetworkTopologyStrategy'
        nodes: the nodes forming the ring; defaults to every node in the
            cluster when omitted or empty

        Returns a list of node addresses. Raises NotImplementedError for any
        other strategy.
        """
        if not nodes:
            nodes = self.cluster.nodelist()

        if strategy == 'NetworkTopologyStrategy':
            # NetworkTopologyStrategy can be broken down into multiple
            # SimpleStrategies, just once per datacenter:
            replicas = []
            for dc, rf in replication_factor.items():
                dc_nodes = [n for n in nodes if n.data_center == dc]
                if not dc_nodes:
                    # An empty list would make the recursive call fall back
                    # to the whole cluster, so skip datacenters without nodes.
                    continue
                replicas.extend(self.get_replicas_for_token(
                    token, rf, nodes=dc_nodes))
            return replicas

        if strategy != 'SimpleStrategy':
            raise NotImplementedError('replication strategy not implemented: %s'
                                      % strategy)

        # Order the nodes by token so that ring positions and the walk
        # below refer to the same sequence.
        ring = sorted(nodes, key=lambda n: n.initial_token)

        # The first replica owns the first token that is >= the key's token,
        # wrapping around to the start of the ring when there is none.
        first_ring_position = next(
            (i for i, n in enumerate(ring) if token <= n.initial_token), 0)

        # Walk the ring from the first replica. At least one and at most
        # len(ring) distinct nodes are returned.
        replica_count = max(1, min(replication_factor, len(ring)))
        return [ring[(first_ring_position + step) % len(ring)].address()
                for step in range(replica_count)]

    def pprint_trace(self, trace):
        """
        Pretty print a trace. Only prints when the root logger's level is
        exactly DEBUG.
        """
        if logging.root.level == logging.DEBUG:
            print(("-" * 40))
            for t in trace.events:
                print(("%s\t%s\t%s\t%s" % (t.source, t.source_elapsed, t.description, t.thread_name)))
            print(("-" * 40))

    def test_simple(self):
        """
        Test the SimpleStrategy on a 3 node cluster
        """
        self.cluster.populate(3).start()
        node1 = self.cluster.nodelist()[0]
        session = self.patient_exclusive_cql_connection(node1, consistency_level=ConsistencyLevel.ALL)
        session.max_trace_wait = 120

        replication_factor = 3
        create_ks(session, 'test', replication_factor)
        session.execute('CREATE TABLE test.test (id int PRIMARY KEY, value text)', trace=False)

        for key, token in murmur3_hashes.items():
            logger.debug('murmur3 hash key={key},token={token}'.format(key=key, token=token))
            query = SimpleStatement("INSERT INTO test (id, value) VALUES ({}, 'asdf')".format(key), consistency_level=ConsistencyLevel.ALL)
            future = session.execute_async(query, trace=True)
            future.result()
            # Wait for the trace table to stop growing before reading the trace.
            block_on_trace(session)

            trace = future.get_query_trace(max_wait=120)
            self.pprint_trace(trace)

            stats = self.get_replicas_from_trace(trace)
            replicas_should_be = set(self.get_replicas_for_token(
                token, replication_factor))
            logger.debug('\nreplicas should be: %s', replicas_should_be)
            logger.debug('replicas were: %s', stats['replicas'])

            # Make sure the correct nodes are replicas:
            assert stats['replicas'] == replicas_should_be
            # Make sure that each replica node was contacted and
            # acknowledged the write:
            assert stats['nodes_sent_write'] == stats['nodes_responded_write']

    @pytest.mark.resource_intensive
    def test_network_topology(self):
        """
        Test the NetworkTopologyStrategy on a 2DC 3:3 node cluster
        """
        self.cluster.populate([3, 3]).start()

        node1 = self.cluster.nodelist()[0]
        ip_nodes = {node.address(): node for node in self.cluster.nodelist()}
        session = self.patient_exclusive_cql_connection(node1, consistency_level=ConsistencyLevel.ALL)

        replication_factor = {'dc1': 2, 'dc2': 2}
        create_ks(session, 'test', replication_factor)
        session.execute('CREATE TABLE test.test (id int PRIMARY KEY, value text)', trace=False)

        forwarders_used = set()

        for key, token in murmur3_hashes.items():
            query = SimpleStatement("INSERT INTO test (id, value) VALUES ({}, 'asdf')".format(key), consistency_level=ConsistencyLevel.ALL)
            future = session.execute_async(query, trace=True)
            future.result()
            # Wait for the trace table to stop growing before reading the trace.
            block_on_trace(session)

            trace = future.get_query_trace(max_wait=120)
            self.pprint_trace(trace)

            stats = self.get_replicas_from_trace(trace)
            replicas_should_be = set(self.get_replicas_for_token(
                token, replication_factor, strategy='NetworkTopologyStrategy'))
            logger.debug('Current token is %s', token)
            logger.debug('\nreplicas should be: %s', replicas_should_be)
            logger.debug('replicas were: %s', stats['replicas'])

            # Make sure the coordinator only talked to a single node in
            # the second datacenter - CASSANDRA-5632:
            num_in_other_dcs_contacted = sum(
                1 for node_contacted in stats['nodes_contacted'][node1.address()]
                if ip_nodes[node_contacted].data_center != node1.data_center)
            assert num_in_other_dcs_contacted == 1

            # Record the forwarder used for each INSERT:
            forwarders_used |= stats['forwarders']

            try:
                # Make sure the correct nodes are replicas:
                assert stats['replicas'] == replicas_should_be
                # Make sure that each replica node was contacted and
                # acknowledged the write:
                assert stats['nodes_sent_write'] == stats['nodes_responded_write']
            except AssertionError:
                logger.debug("Failed on key %s and token %s.", key, token)
                # Bare raise keeps the original traceback intact.
                raise

        # Given a diverse enough keyset, each node in the second
        # datacenter should get a chance to be a forwarder:
        assert len(forwarders_used) == 3


class TestSnitchConfigurationUpdate(Tester):
    """
    Test to reproduce CASSANDRA-10238, wherein changing snitch properties to change racks without a restart
    could violate RF contract.

    Since CASSANDRA-10243 it is no longer possible to change rack or dc for live nodes so we must specify
    which nodes should be shutdown in order to have the rack changed.
    """

    @pytest.fixture(autouse=True)
    def fixture_add_additional_log_patterns(self, fixture_dtest_setup):
        """
        Set ignore_log_patterns on the dtest setup to the error messages
        that the tests in this class look for or provoke.
        """
        snitch_error_patterns = (
            "Fatal exception during initialization",
            "Cannot start node if snitch's rack(.*) differs from previous rack(.*)",
            "Cannot update data center or rack",
        )
        fixture_dtest_setup.ignore_log_patterns = snitch_error_patterns

    def check_endpoint_count(self, ks, table, nodes, rf):
        """
        Check a dummy key expecting it to have replication factor as the sum of rf on all dcs.

        ks, table: keyspace and table passed to 'nodetool getendpoints'
        nodes: the nodes on which the nodetool command is run
        rf: replication options; every entry except 'class' is read as a
            per-datacenter replication factor

        Fails with an AssertionError when the number of IPv4 addresses in a
        node's output differs from the summed replication factor.
        """
        expected_count = sum(int(r) for d, r in rf.items() if d != 'class')
        cmd = "getendpoints {} {} dummy".format(ks, table)
        for node in nodes:
            out, err, _ = node.nodetool(cmd)

            if err.strip():
                logger.debug("Error running 'nodetool {}': {}".format(cmd, err))

            logger.debug("Endpoints for node {}, expected count is {}".format(node.address(), expected_count))
            logger.debug(out)
            # Raw string: '\d' in a normal string literal is an invalid escape sequence.
            ips_found = re.findall(r'(\d+\.\d+\.\d+\.\d+)', out)

            assert len(ips_found) == expected_count, "wrong number of endpoints found ({}), should be: {}".format(len(ips_found), expected_count)

    def wait_for_nodes_on_racks(self, nodes, expected_racks):
        """
        Poll 'nodetool status' on each node until the racks it reports match
        expected_racks, ignoring order.

        Each node is given 120 seconds, polled every 5 seconds; a
        RuntimeError is raised for the first node that runs out of time.
        """
        # Captures the last whitespace-separated field of the "UN 127.0.0..." status lines.
        status_line = re.compile(r"^UN(?:\s*)127\.0\.0(?:.*)\s(.*)$", re.IGNORECASE)

        for index, node in enumerate(nodes):
            deadline = time.time() + 120
            propagated = False

            while time.time() < deadline:
                out, err, _ = node.nodetool("status")

                logger.debug(out)
                if len(err.strip()) > 0:
                    logger.debug("Error trying to run nodetool status: {}".format(err))

                matches = (status_line.match(line) for line in out.split(os.linesep))
                # Sorted on both sides because the order of the status lines is not deterministic.
                reported_racks = sorted(m.group(1) for m in matches if m)

                if reported_racks == sorted(expected_racks):
                    # great, the topology change is propagated
                    logger.debug("Topology change detected on node {}".format(index))
                    propagated = True
                    break

                logger.debug("Waiting for topology change on node {}".format(index))
                time.sleep(5)

            if not propagated:
                raise RuntimeError("Ran out of time waiting for topology to change on node {}".format(index))

    @since('2.1', max_version='5.0.x')
    def test_rf_collapse_gossiping_property_file_snitch(self):
        """
        @jira_ticket CASSANDRA-10238
        @jira_ticket CASSANDRA-10242
        @jira_ticket CASSANDRA-10243

        Three dc1 nodes start on racks rack0..rack2 under GossipingPropertyFileSnitch and are
        then all moved onto rack1; the endpoint count must still match the RF afterwards.
        """
        def rack_per_node(i, node):
            return ["dc=dc1", "rack=rack%d" % i]

        def shared_rack(i, node):
            return ["dc=dc1", "rack=rack1"]

        self._test_rf_on_snitch_update(
            nodes=[3],
            rf={'class': "'NetworkTopologyStrategy'", 'dc1': 3},
            snitch_class_name='GossipingPropertyFileSnitch',
            snitch_config_file='cassandra-rackdc.properties',
            snitch_lines_before=rack_per_node,
            snitch_lines_after=shared_rack,
            final_racks=["rack1"] * 3,
            nodes_to_shutdown=[0, 2],
        )

    @since('2.1', max_version='5.0.x')
    def test_rf_expand_gossiping_property_file_snitch(self):
        """
        @jira_ticket CASSANDRA-10238
        @jira_ticket CASSANDRA-10242
        @jira_ticket CASSANDRA-10243

        Three dc1 nodes start together on rack1 under GossipingPropertyFileSnitch and are then
        spread over rack0..rack2; the endpoint count must still match the RF afterwards.
        """
        def shared_rack(i, node):
            return ["dc=dc1", "rack=rack1"]

        def rack_per_node(i, node):
            return ["dc=dc1", "rack=rack%d" % i]

        self._test_rf_on_snitch_update(
            nodes=[3],
            rf={'class': "'NetworkTopologyStrategy'", 'dc1': 3},
            snitch_class_name='GossipingPropertyFileSnitch',
            snitch_config_file='cassandra-rackdc.properties',
            snitch_lines_before=shared_rack,
            snitch_lines_after=rack_per_node,
            final_racks=["rack0", "rack1", "rack2"],
            nodes_to_shutdown=[0, 2],
        )

    @pytest.mark.resource_intensive
    @since('2.1', max_version='5.0.x')
    def test_rf_collapse_gossiping_property_file_snitch_multi_dc(self):
        """
        @jira_ticket CASSANDRA-10238
        @jira_ticket CASSANDRA-10242
        @jira_ticket CASSANDRA-10243

        Multi-dc variant (3 nodes in each of two datacenters): every node starts on rack(i % 3)
        of its own datacenter under GossipingPropertyFileSnitch and all are then moved onto
        rack1; the endpoint count must still match the RF afterwards.
        """
        def rack_per_node(i, node):
            return ["dc=" + format(node.data_center), "rack=rack%d" % (i % 3)]

        def shared_rack(i, node):
            return ["dc=" + format(node.data_center), "rack=rack1"]

        self._test_rf_on_snitch_update(
            nodes=[3, 3],
            rf={'class': "'NetworkTopologyStrategy'", 'dc1': 3, 'dc2': 3},
            snitch_class_name='GossipingPropertyFileSnitch',
            snitch_config_file='cassandra-rackdc.properties',
            snitch_lines_before=rack_per_node,
            snitch_lines_after=shared_rack,
            final_racks=["rack1"] * 6,
            nodes_to_shutdown=[0, 2, 3, 5],
        )

    @pytest.mark.resource_intensive
    @since('2.1', max_version='5.0.x')
    def test_rf_expand_gossiping_property_file_snitch_multi_dc(self):
        """
        @jira_ticket CASSANDRA-10238
        @jira_ticket CASSANDRA-10242
        @jira_ticket CASSANDRA-10243

        Multi-dc variant (3 nodes in each of two datacenters): every node starts on rack1 of
        its own datacenter under GossipingPropertyFileSnitch and is then moved to rack(i % 3);
        the endpoint count must still match the RF afterwards.
        """
        def shared_rack(i, node):
            return ["dc=" + format(node.data_center), "rack=rack1"]

        def rack_per_node(i, node):
            return ["dc=" + format(node.data_center), "rack=rack%d" % (i % 3)]

        self._test_rf_on_snitch_update(
            nodes=[3, 3],
            rf={'class': "'NetworkTopologyStrategy'", 'dc1': 3, 'dc2': 3},
            snitch_class_name='GossipingPropertyFileSnitch',
            snitch_config_file='cassandra-rackdc.properties',
            snitch_lines_before=shared_rack,
            snitch_lines_after=rack_per_node,
            final_racks=["rack0", "rack1", "rack2"] * 2,
            nodes_to_shutdown=[0, 2, 3, 5],
        )

    @since('2.1', max_version='5.0.x')
    def test_rf_collapse_property_file_snitch(self):
        """
        @jira_ticket CASSANDRA-10238
        @jira_ticket CASSANDRA-10242
        @jira_ticket CASSANDRA-10243

        Three dc1 nodes start with one rack per address under PropertyFileSnitch and are then
        collapsed onto the default dc1:rack0; the endpoint count must still match the RF
        afterwards.
        """
        def rack_per_address(i, node):
            return ["127.0.0.1=dc1:rack0", "127.0.0.2=dc1:rack1", "127.0.0.3=dc1:rack2"]

        def default_rack_only(i, node):
            return ["default=dc1:rack0"]

        self._test_rf_on_snitch_update(
            nodes=[3],
            rf={'class': "'NetworkTopologyStrategy'", 'dc1': 3},
            snitch_class_name='PropertyFileSnitch',
            snitch_config_file='cassandra-topology.properties',
            snitch_lines_before=rack_per_address,
            snitch_lines_after=default_rack_only,
            final_racks=["rack0"] * 3,
            nodes_to_shutdown=[1, 2],
        )

    @since('2.1', max_version='5.0.x')
    def test_rf_expand_property_file_snitch(self):
        """
        @jira_ticket CASSANDRA-10238
        @jira_ticket CASSANDRA-10242
        @jira_ticket CASSANDRA-10243

        Three dc1 nodes start on the default dc1:rack0 under PropertyFileSnitch and are then
        given one rack per address; the endpoint count must still match the RF afterwards.
        """
        def default_rack_only(i, node):
            return ["default=dc1:rack0"]

        def rack_per_address(i, node):
            return ["127.0.0.1=dc1:rack0", "127.0.0.2=dc1:rack1", "127.0.0.3=dc1:rack2"]

        self._test_rf_on_snitch_update(
            nodes=[3],
            rf={'class': "'NetworkTopologyStrategy'", 'dc1': 3},
            snitch_class_name='PropertyFileSnitch',
            snitch_config_file='cassandra-topology.properties',
            snitch_lines_before=default_rack_only,
            snitch_lines_after=rack_per_address,
            final_racks=["rack0", "rack1", "rack2"],
            nodes_to_shutdown=[1, 2],
        )

    @since('2.0', max_version='2.1.x')
    def test_rf_collapse_yaml_file_snitch(self):
        """
        @jira_ticket CASSANDRA-10238
        @jira_ticket CASSANDRA-10242
        @jira_ticket CASSANDRA-10243

        Three dc1 nodes start with one rack per address under YamlFileNetworkTopologySnitch
        and are then collapsed onto rack0; the endpoint count must still match the RF
        afterwards.
        """
        one_rack_per_address = [
            "topology:",
            "  - dc_name: dc1",
            "    racks:",
            "    - rack_name: rack0",
            "      nodes:",
            "      - broadcast_address: 127.0.0.1",
            "    - rack_name: rack1",
            "      nodes:",
            "      - broadcast_address: 127.0.0.2",
            "    - rack_name: rack2",
            "      nodes:",
            "      - broadcast_address: 127.0.0.3",
        ]
        everything_on_rack0 = [
            "topology:",
            "  - dc_name: dc1",
            "    racks:",
            "    - rack_name: rack0",
            "      nodes:",
            "      - broadcast_address: 127.0.0.1",
            "      - broadcast_address: 127.0.0.2",
            "      - broadcast_address: 127.0.0.3",
        ]

        # The same topology file is written for every node; each call hands out a fresh copy.
        self._test_rf_on_snitch_update(
            nodes=[3],
            rf={'class': "'NetworkTopologyStrategy'", 'dc1': 3},
            snitch_class_name='YamlFileNetworkTopologySnitch',
            snitch_config_file='cassandra-topology.yaml',
            snitch_lines_before=lambda i, node: list(one_rack_per_address),
            snitch_lines_after=lambda i, node: list(everything_on_rack0),
            final_racks=["rack0"] * 3,
            nodes_to_shutdown=[1, 2],
        )

    @since('2.0', max_version='2.1.x')
    def test_rf_expand_yaml_file_snitch(self):
        """
        @jira_ticket CASSANDRA-10238
        @jira_ticket CASSANDRA-10242
        @jira_ticket CASSANDRA-10243

        Three dc1 nodes start together on rack0 under YamlFileNetworkTopologySnitch and are
        then given one rack per address; the endpoint count must still match the RF
        afterwards.
        """
        everything_on_rack0 = [
            "topology:",
            "  - dc_name: dc1",
            "    racks:",
            "    - rack_name: rack0",
            "      nodes:",
            "      - broadcast_address: 127.0.0.1",
            "      - broadcast_address: 127.0.0.2",
            "      - broadcast_address: 127.0.0.3",
        ]
        one_rack_per_address = [
            "topology:",
            "  - dc_name: dc1",
            "    racks:",
            "    - rack_name: rack0",
            "      nodes:",
            "      - broadcast_address: 127.0.0.1",
            "    - rack_name: rack1",
            "      nodes:",
            "      - broadcast_address: 127.0.0.2",
            "    - rack_name: rack2",
            "      nodes:",
            "      - broadcast_address: 127.0.0.3",
        ]

        # The same topology file is written for every node; each call hands out a fresh copy.
        self._test_rf_on_snitch_update(
            nodes=[3],
            rf={'class': "'NetworkTopologyStrategy'", 'dc1': 3},
            snitch_class_name='YamlFileNetworkTopologySnitch',
            snitch_config_file='cassandra-topology.yaml',
            snitch_lines_before=lambda i, node: list(everything_on_rack0),
            snitch_lines_after=lambda i, node: list(one_rack_per_address),
            final_racks=["rack0", "rack1", "rack2"],
            nodes_to_shutdown=[1, 2],
        )

    def _test_rf_on_snitch_update(self, nodes, rf, snitch_class_name, snitch_config_file,
                                  snitch_lines_before, snitch_lines_after, final_racks, nodes_to_shutdown):
        """
        Change the snitch topology of a running cluster and verify that the
        endpoint count for a key matches the replication factor both before
        and after the change.

        nodes: argument passed to cluster.populate()
        rf: replication options used in CREATE KEYSPACE (including 'class')
        snitch_class_name: class name under org.apache.cassandra.locator
        snitch_config_file: file name inside each node's conf directory
        snitch_lines_before / snitch_lines_after: callables taking
            (node index, node) and returning the lines of the snitch file to
            write before the cluster starts / after the nodes are stopped
        final_racks: racks 'nodetool status' is expected to report at the end
        nodes_to_shutdown: indexes of the nodes that are stopped and restarted
        """
        cluster = self.cluster

        def write_snitch_files(lines_for):
            # Rewrite the snitch config file of every node in the cluster.
            for index, member in enumerate(cluster.nodelist()):
                config_path = os.path.join(member.get_conf_dir(), snitch_config_file)
                with open(config_path, 'w') as topo_file:
                    for line in lines_for(index, member):
                        topo_file.write(line + os.linesep)

        cluster.populate(nodes)
        snitch_fqcn = 'org.apache.cassandra.locator.{}'.format(snitch_class_name)
        cluster.set_configuration_options(values={'endpoint_snitch': snitch_fqcn})

        # start with the initial rack layout
        write_snitch_files(snitch_lines_before)
        cluster.start()

        session = self.patient_cql_connection(cluster.nodelist()[0])

        replication_options = ', '.join("'{}': {}".format(name, value) for name, value in rf.items())
        session.execute("CREATE KEYSPACE testing WITH replication = {" + replication_options + "}")
        session.execute("CREATE TABLE testing.rf_test (key text PRIMARY KEY, value text)")

        # avoid errors in nodetool calls below checking for the endpoint count
        session.cluster.control_connection.wait_for_schema_agreement()
        # make sure endpoint count is correct before continuing with the rest of the test
        self.check_endpoint_count('testing', 'rf_test', cluster.nodelist(), rf)

        for position in nodes_to_shutdown:
            stopping = cluster.nodelist()[position]
            logger.debug("Shutting down node {}".format(stopping.address()))
            stopping.stop(wait_other_notice=True)

        logger.debug("Updating snitch file")
        write_snitch_files(snitch_lines_after)

        # wait until the config is reloaded before we restart the nodes, the default check period is
        # 5 seconds so we wait for 10 seconds to be sure
        logger.debug("Waiting 10 seconds to make sure snitch file is reloaded...")
        time.sleep(10)

        # Since CASSANDRA-10242 it is no longer possible to start a node with a different rack
        # unless we specify -Dcassandra.ignore_rack and since CASSANDRA-9474 it is no longer
        # possible to start a node with a different dc unless we specify -Dcassandra.ignore_dc.
        for position in nodes_to_shutdown:
            restarting = cluster.nodelist()[position]
            logger.debug("Restarting node {}".format(restarting.address()))
            restarting.start(jvm_args=['-Dcassandra.ignore_rack=true', '-Dcassandra.ignore_dc=true'],
                             wait_for_binary_proto=True)

        self.wait_for_nodes_on_racks(cluster.nodelist(), final_racks)

        # nodes have joined racks, check endpoint counts again
        self.check_endpoint_count('testing', 'rf_test', cluster.nodelist(), rf)

    def test_cannot_restart_with_different_rack(self):
        """
        @jira_ticket CASSANDRA-10242

        Start a single node on rack1, stop it, rewrite its snitch file to rack2 and start it
        again without '-Dcassandra.ignore_rack=true'.

        From version 5.1 the node is expected to come up and 'nodetool status' must still
        report rack1. On older versions the test waits for the start-up error in the log.
        """
        rackdc_file = 'cassandra-rackdc.properties'

        def write_rackdc(target, rack):
            # Overwrite the node's rackdc file, keeping its own datacenter.
            with open(os.path.join(target.get_conf_dir(), rackdc_file), 'w') as topo_file:
                for line in ["dc={}".format(target.data_center), "rack=" + rack]:
                    topo_file.write(line + os.linesep)

        cluster = self.cluster
        cluster.populate(1)
        cluster.set_configuration_options(
            values={'endpoint_snitch': 'org.apache.cassandra.locator.GossipingPropertyFileSnitch'})

        node1 = cluster.nodelist()[0]
        write_rackdc(node1, "rack1")

        logger.debug("Starting node {} with rack1".format(node1.address()))
        node1.start(wait_for_binary_proto=True)

        logger.debug("Shutting down node {}".format(node1.address()))
        node1.stop(wait_other_notice=True)

        logger.debug("Updating snitch file with rack2")
        for member in cluster.nodelist():
            write_rackdc(member, "rack2")

        logger.debug("Restarting node {} with rack2".format(node1.address()))
        mark = node1.mark_log()

        if cluster.version() < LooseVersion('5.1'):
            node1.start()
            # check node not running
            logger.debug("Waiting for error message in log file")

            if cluster.version() >= '2.2':
                expected_error = "Cannot start node if snitch's rack(.*) differs from previous rack(.*)"
            else:
                expected_error = "Fatal exception during initialization"
            node1.watch_log_for(expected_error, from_mark=mark, timeout=120)
            return

        node1.start(wait_for_binary_proto=True)
        status_line = re.compile(r"^UN(?:\s*)127\.0\.0(?:.*)\s(.*)$", re.IGNORECASE)
        out, err, _ = node1.nodetool("status")
        matches = (status_line.match(line) for line in out.split(os.linesep))
        reported = [m.group(1) for m in matches if m]
        # The last matching status line wins; no match leaves the rack empty.
        rack_after_restart = reported[-1] if reported else ""
        assert rack_after_restart == "rack1", "Expected rack to still be reported as rack1, but was {}".format(rack_after_restart)


    @since('2.1', max_version='5.0.x')
    def test_failed_snitch_update_gossiping_property_file_snitch(self):
        """
        @jira_ticket CASSANDRA-10243

        Test that we cannot change the rack of a live node with GossipingPropertyFileSnitch.
        """
        self._test_failed_snitch_update(nodes=[3],
                                        snitch_class_name='GossipingPropertyFileSnitch',
                                        snitch_config_file='cassandra-rackdc.properties',
                                        snitch_lines_before=["dc=dc1", "rack=rack1"],
                                        snitch_lines_after=["dc=dc1", "rack=rack2"],
                                        racks=["rack1", "rack1", "rack1"],
                                        error='')

    @since('2.1', max_version='5.0.x')
    def test_failed_snitch_update_property_file_snitch(self):
        """
        @jira_ticket CASSANDRA-10243

        Test that we cannot change the rack of a live node with PropertyFileSnitch.
        """
        self._test_failed_snitch_update(nodes=[3],
                                        snitch_class_name='PropertyFileSnitch',
                                        snitch_config_file='cassandra-topology.properties',
                                        snitch_lines_before=["default=dc1:rack1"],
                                        snitch_lines_after=["default=dc1:rack2"],
                                        racks=["rack1", "rack1", "rack1"],
                                        error='Cannot update data center or rack')

    @since('2.0', max_version='2.1.x')
    def test_failed_snitch_update_yaml_file_snitch(self):
        """
        @jira_ticket CASSANDRA-10243

        Test that we cannot change the rack of a live node with YamlFileNetworkTopologySnitch.
        """
        self._test_failed_snitch_update(nodes=[3],
                                        snitch_class_name='YamlFileNetworkTopologySnitch',
                                        snitch_config_file='cassandra-topology.yaml',
                                        snitch_lines_before=["topology:",
                                                             "  - dc_name: dc1",
                                                             "    racks:",
                                                             "    - rack_name: rack1",
                                                             "      nodes:",
                                                             "      - broadcast_address: 127.0.0.1",
                                                             "      - broadcast_address: 127.0.0.2",
                                                             "      - broadcast_address: 127.0.0.3"],
                                        snitch_lines_after=["topology:",
                                                            "  - dc_name: dc1",
                                                            "    racks:",
                                                            "    - rack_name: rack2",
                                                            "      nodes:",
                                                            "      - broadcast_address: 127.0.0.1",
                                                            "      - broadcast_address: 127.0.0.2",
                                                            "      - broadcast_address: 127.0.0.3"],
                                        racks=["rack1", "rack1", "rack1"],
                                        error='Cannot update data center or rack')

    def _test_failed_snitch_update(self, nodes, snitch_class_name, snitch_config_file,
                                   snitch_lines_before, snitch_lines_after, racks, error):
        """
        Rewrite the snitch configuration of a running cluster and check that the racks do not change.

        The cluster is populated and started with `snitch_lines_before` in every
        node's snitch file, the racks are checked, the file is rewritten with
        `snitch_lines_after` on every node, and after a pause the racks are
        checked again against the same expected values.

        @param nodes Passed straight to cluster.populate()
        @param snitch_class_name Simple class name of the snitch, resolved under org.apache.cassandra.locator
        @param snitch_config_file Name of the snitch file inside each node's conf directory
        @param snitch_lines_before Lines of the snitch file the cluster is started with
        @param snitch_lines_after Lines written to the snitch file while the cluster is running
        @param racks Expected racks, passed to wait_for_nodes_on_racks() both before and after the rewrite
        @param error Pattern to wait for in every node's log after the rewrite; skipped when empty
        """
        cluster = self.cluster
        cluster.populate(nodes)
        cluster.set_configuration_options(values={'endpoint_snitch': 'org.apache.cassandra.locator.{}'
                                                  .format(snitch_class_name)})

        def write_snitch_config(lines):
            # The files are opened in text mode, which already maps '\n' to the
            # platform line terminator; writing os.linesep instead would end up
            # as '\r\r\n' on Windows.
            content = ''.join(line + '\n' for line in lines)
            for node in cluster.nodelist():
                with open(os.path.join(node.get_conf_dir(), snitch_config_file), 'w') as topo_file:
                    topo_file.write(content)

        # start with initial snitch lines
        write_snitch_config(snitch_lines_before)

        cluster.start()

        # check racks are as specified
        self.wait_for_nodes_on_racks(cluster.nodelist(), racks)

        # remember where each log stands so only messages logged after the rewrite are matched
        marks = [node.mark_log() for node in cluster.nodelist()]

        logger.debug("Updating snitch file")
        write_snitch_config(snitch_lines_after)

        # wait until the config is reloaded, the default check period is
        # 5 seconds so we wait for 10 seconds to be sure
        logger.debug("Waiting 10 seconds to make sure snitch file is reloaded...")
        time.sleep(10)

        # check racks have not changed
        self.wait_for_nodes_on_racks(cluster.nodelist(), racks)

        # check error in log files if applicable
        if error:
            for node, mark in zip(cluster.nodelist(), marks):
                node.watch_log_for(error, from_mark=mark)

    def test_switch_data_center_startup_fails(self):
        """
        @jira_ticket CASSANDRA-9474

        Confirm that switching data centers fails to bring up the node.

        A single node is started with dc=dc9 in cassandra-rackdc.properties,
        stopped, and started again after the file has been rewritten with
        dc=dc0. On 5.1 and later the node is started and waited on, and
        `nodetool status` must still report dc9; on earlier versions the
        startup error must appear in the node's log.
        """
        expected_error = (r"Cannot start node if snitch's data center (.*) differs from previous data center (.*)\. "
                          "Please fix the snitch configuration, decommission and rebootstrap this node "
                          "or use the flag -Dcassandra.ignore_dc=true.")
        self.fixture_dtest_setup.ignore_log_patterns = [expected_error]

        cluster = self.cluster
        cluster.populate(1)
        cluster.set_configuration_options(values={'endpoint_snitch': 'org.apache.cassandra.locator.GossipingPropertyFileSnitch'})

        node = cluster.nodelist()[0]

        def write_rackdc(data_center):
            # The file is opened in text mode, which already maps '\n' to the
            # platform line terminator; writing os.linesep instead would end up
            # as '\r\r\n' on Windows.
            with open(os.path.join(node.get_conf_dir(), 'cassandra-rackdc.properties'), 'w') as topo_file:
                topo_file.write("dc={}\n".format(data_center))
                topo_file.write("rack=rack1\n")

        write_rackdc("dc9")

        cluster.start()

        node.stop()

        write_rackdc("dc0")

        mark = node.mark_log()
        if self.cluster.version() >= LooseVersion('5.1'):
            node.start(wait_for_binary_proto=True)
            regex = re.compile(r"^Datacenter:\s?(.*)$", re.IGNORECASE)
            out, _, _ = node.nodetool("status")
            dc_after_restart = ""
            # splitlines() handles any line terminator, so a stray '\r' cannot
            # leak into the captured datacenter name
            for line in out.splitlines():
                m = regex.match(line)
                if m:
                    dc_after_restart = m.group(1)
            assert dc_after_restart == "dc9", "Expected datacenter to still be reported as dc9, but was {}".format(dc_after_restart)
        else:
            node.start()
            node.watch_log_for(expected_error, from_mark=mark, timeout=120)