首页 > 解决方案 > 启动第二个节点后的 Cassandra 断言错误

问题描述

我有一个 Cassandra 集群,它有两个节点,具有简单的复制策略。

一切运行良好,直到其中一个节点崩溃。我通过克隆剩余节点的虚拟机恢复了崩溃的节点(也就是说,整个文件系统也被一并克隆了),并更新了监听地址(listen_address)和 RPC 地址(rpc_address)。

现在我不断收到以下奇怪的错误。

当我单独运行其中任何一个节点时,一切都运行良好。但是只要我再启动第二个节点,第一个节点就会出现错误!

ERROR [Native-Transport-Requests-1] 2020-07-21 08:19:31,042 Message.java:693 - Unexpected exception during request; channel = [id: 0xc1935e7a, L:/192.168.40.15:9042 - R:/192.168.40.15:47980]
java.lang.AssertionError: null
    at org.apache.cassandra.locator.TokenMetadata.firstTokenIndex(TokenMetadata.java:1065) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.locator.TokenMetadata.firstToken(TokenMetadata.java:1079) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.locator.AbstractReplicationStrategy.getNaturalEndpoints(AbstractReplicationStrategy.java:107) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.service.StorageService.getLiveNaturalEndpoints(StorageService.java:3866) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.service.StorageService.getLiveNaturalEndpoints(StorageService.java:3852) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.service.StorageProxy.getLiveSortedEndpoints(StorageProxy.java:1914) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.service.StorageProxy$RangeIterator.computeNext(StorageProxy.java:1992) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.service.StorageProxy$RangeIterator.computeNext(StorageProxy.java:1962) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.utils.AbstractIterator.hasNext(AbstractIterator.java:47) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at com.google.common.collect.Iterators$PeekingImpl.hasNext(Iterators.java:1149) ~[guava-18.0.jar:na]
    at org.apache.cassandra.service.StorageProxy$RangeMerger.computeNext(StorageProxy.java:2014) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.service.StorageProxy$RangeMerger.computeNext(StorageProxy.java:1999) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.utils.AbstractIterator.hasNext(AbstractIterator.java:47) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.service.StorageProxy$RangeCommandIterator.computeNext(StorageProxy.java:2132) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.service.StorageProxy$RangeCommandIterator.computeNext(StorageProxy.java:2092) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.utils.AbstractIterator.hasNext(AbstractIterator.java:47) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.db.transform.BasePartitions.hasNext(BasePartitions.java:92) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.cql3.statements.SelectStatement.process(SelectStatement.java:786) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.cql3.statements.SelectStatement.processResults(SelectStatement.java:438) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.cql3.statements.SelectStatement.execute(SelectStatement.java:416) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.cql3.statements.SelectStatement.execute(SelectStatement.java:289) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.cql3.statements.SelectStatement.execute(SelectStatement.java:117) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.cql3.QueryProcessor.processStatement(QueryProcessor.java:225) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.cql3.QueryProcessor.process(QueryProcessor.java:256) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.cql3.QueryProcessor.process(QueryProcessor.java:241) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.transport.messages.QueryMessage.execute(QueryMessage.java:116) ~[apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.transport.Message$Dispatcher.channelRead0(Message.java:566) [apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.transport.Message$Dispatcher.channelRead0(Message.java:410) [apache-cassandra-3.11.4.jar:3.11.4]
    at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:105) [netty-all-4.0.44.Final.jar:4.0.44.Final]
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:357) [netty-all-4.0.44.Final.jar:4.0.44.Final]
    at io.netty.channel.AbstractChannelHandlerContext.access$600(AbstractChannelHandlerContext.java:35) [netty-all-4.0.44.Final.jar:4.0.44.Final]
    at io.netty.channel.AbstractChannelHandlerContext$7.run(AbstractChannelHandlerContext.java:348) [netty-all-4.0.44.Final.jar:4.0.44.Final]
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [na:1.8.0_252]
    at org.apache.cassandra.concurrent.AbstractLocalAwareExecutorService$FutureTask.run(AbstractLocalAwareExecutorService.java:162) [apache-cassandra-3.11.4.jar:3.11.4]
    at org.apache.cassandra.concurrent.SEPWorker.run(SEPWorker.java:114) [apache-cassandra-3.11.4.jar:3.11.4]
    at java.lang.Thread.run(Thread.java:748) [na:1.8.0_252]

我正在使用以下 Cassandra 版本:[cqlsh 5.0.1 | Cassandra 3.11.4 | CQL spec 3.4.4 | Native protocol v4]

以下是配置文件:

cassandra.yaml

cluster_name: 'babelfish'

num_tokens: 256



hinted_handoff_enabled: true



hinted_handoff_throttle_in_kb: 1024

max_hints_delivery_threads: 2


hints_flush_period_in_ms: 10000

max_hints_file_size_in_mb: 128


batchlog_replay_throttle_in_kb: 1024

authenticator: AllowAllAuthenticator

authorizer: AllowAllAuthorizer

role_manager: CassandraRoleManager

roles_validity_in_ms: 2000


permissions_validity_in_ms: 2000


credentials_validity_in_ms: 2000


partitioner: org.apache.cassandra.dht.Murmur3Partitioner

data_file_directories:
    - /var/lib/cassandra/data

commitlog_directory: /var/lib/cassandra/commitlog

cdc_enabled: false


disk_failure_policy: stop

commit_failure_policy: stop

prepared_statements_cache_size_mb:

thrift_prepared_statements_cache_size_mb:

key_cache_size_in_mb:

key_cache_save_period: 14400



row_cache_size_in_mb: 0

row_cache_save_period: 0


counter_cache_size_in_mb:

counter_cache_save_period: 7200


saved_caches_directory: /var/lib/cassandra/saved_caches

commitlog_sync: periodic
commitlog_sync_period_in_ms: 10000

commitlog_segment_size_in_mb: 32


seed_provider:
    - class_name: org.apache.cassandra.locator.SimpleSeedProvider
      parameters:
          - seeds: "192.168.30.15, 192.168.40.15"

concurrent_reads: 32
concurrent_writes: 32
concurrent_counter_writes: 32

concurrent_materialized_view_writes: 32







memtable_allocation_type: heap_buffers





index_summary_capacity_in_mb:

index_summary_resize_interval_in_minutes: 60

trickle_fsync: false
trickle_fsync_interval_in_kb: 10240

storage_port: 7000

ssl_storage_port: 7001

listen_address: 192.168.40.15






start_native_transport: true
native_transport_port: 9042



start_rpc: false

rpc_address: 192.168.40.15



rpc_port: 9160


rpc_keepalive: true

rpc_server_type: sync





thrift_framed_transport_size_in_mb: 15

incremental_backups: false

snapshot_before_compaction: false

auto_snapshot: true

column_index_size_in_kb: 64

column_index_cache_size_in_kb: 2


compaction_throughput_mb_per_sec: 16

sstable_preemptive_open_interval_in_mb: 50



read_request_timeout_in_ms: 5000
range_request_timeout_in_ms: 10000
write_request_timeout_in_ms: 2000
counter_write_request_timeout_in_ms: 5000
cas_contention_timeout_in_ms: 1000
truncate_request_timeout_in_ms: 60000
request_timeout_in_ms: 10000

slow_query_log_timeout_in_ms: 500

cross_node_timeout: false



endpoint_snitch: GossipingPropertyFileSnitch

dynamic_snitch_update_interval_in_ms: 100
dynamic_snitch_reset_interval_in_ms: 600000
dynamic_snitch_badness_threshold: 0.1

request_scheduler: org.apache.cassandra.scheduler.NoScheduler



server_encryption_options:
    internode_encryption: none
    keystore: conf/.keystore
    keystore_password: cassandra
    truststore: conf/.truststore
    truststore_password: cassandra

client_encryption_options:
    enabled: false
    optional: false
    keystore: conf/.keystore
    keystore_password: cassandra

internode_compression: dc

inter_dc_tcp_nodelay: false

tracetype_query_ttl: 86400
tracetype_repair_ttl: 604800


enable_user_defined_functions: false

enable_scripted_user_defined_functions: false

enable_materialized_views: true

windows_timer_interval: 1


transparent_data_encryption_options:
    enabled: false
    chunk_length_kb: 64
    cipher: AES/CBC/PKCS5Padding
    key_alias: testing:1
    key_provider:
      - class_name: org.apache.cassandra.security.JKSKeyProvider
        parameters:
          - keystore: conf/.keystore
            keystore_password: cassandra
            store_type: JCEKS
            key_password: cassandra



tombstone_warn_threshold: 1000
tombstone_failure_threshold: 100000

batch_size_warn_threshold_in_kb: 5

batch_size_fail_threshold_in_kb: 50

unlogged_batch_across_partitions_warn_threshold: 10

compaction_large_partition_warning_threshold_mb: 100

gc_warn_threshold_in_ms: 1000


back_pressure_enabled: false
back_pressure_strategy:
    - class_name: org.apache.cassandra.net.RateBasedBackPressure
      parameters:
        - high_ratio: 0.90
          factor: 5
          flow: FAST

cassandra-rackdc.properties

# These properties are used with GossipingPropertyFileSnitch and will
# indicate the rack and dc for this node
dc=DC1
rack=RACK1

# Add a suffix to a datacenter name. Used by the Ec2Snitch and Ec2MultiRegionSnitch
# to append a string to the EC2 region name.
#dc_suffix=

# Uncomment the following line to make this snitch prefer the internal ip when possible, as the Ec2MultiRegionSnitch does.
# prefer_local=true

cassandra-topology.properties

# Cassandra Node IP=Data Center:Rack
192.168.30.15=DC1:RACK1
192.168.40.15=DC1:RACK1

# default for unknown nodes
default=DC1:r1

# Native IPv6 is supported, however you must escape the colon in the IPv6 Address
# Also be sure to comment out JVM_OPTS="$JVM_OPTS -Djava.net.preferIPv4Stack=true"
# in cassandra-env.sh
# fe80\:0\:0\:0\:202\:b3ff\:fe1e\:8329=DC1:RAC3

此错误的根源可能是什么,如何解决?

标签: cassandra assertion

解决方案


如果您克隆的是包含全部数据的虚拟机,那么克隆出的节点会带有第一个节点的所有数据,包括该节点的 ID。要解决这个问题,请先关闭第二个节点,删除 data_file_directories 所指目录中的所有数据以及提交日志(commitlog),并在种子列表中只保留第一个节点作为种子节点;然后启动第二个节点,这样它就会正常加入集群。此过程完成后,再更新种子列表(如果把第二个节点留在种子列表中,它不会加入现有集群,而是会引导出一个新集群)。


推荐阅读