|
2 | 2 |
|
3 | 3 | require_relative 'rspec_helper' |
4 | 4 | require 'pp' |
| 5 | +require 'timeout' |
5 | 6 |
|
6 | 7 | class Sharded < ActiveRecord::Base |
7 | 8 | self.table_name = 'sharded' |
@@ -187,4 +188,129 @@ def conn(db, prepared) |
187 | 188 | end |
188 | 189 | end |
189 | 190 | end |
| 191 | + |
# Chaos test: drives concurrent ActiveRecord traffic through a Toxiproxy link
# with injected latency, kills worker threads at random while they run, and
# then checks that fresh queries still succeed afterwards.
describe 'chaos testing with interrupted queries' do
  # Every example starts from a freshly created, empty `sharded` table.
  before do
    conn('failover', false)
    ActiveRecord::Base.connection.execute 'DROP TABLE IF EXISTS sharded'
    ActiveRecord::Base.connection.execute 'CREATE TABLE sharded (id BIGSERIAL PRIMARY KEY, value TEXT)'
  end

  it 'handles interrupted queries and continues operating normally' do
    interrupted_count = 0
    successful_count = 0
    mutex = Mutex.new

    # Apply latency toxic to slow down query transmission,
    # making it easier to interrupt queries mid-flight
    Toxiproxy[:primary].toxic(:latency, latency: 100, jitter: 50).apply do
      # Phase 1: Chaos - 10 worker threads, each attempting 100 random operations.
      chaos_threads = Array.new(10) do |thread_id|
        Thread.new do
          100.times do |i|
            begin
              case rand(3)
              when 0
                # SELECT query
                Sharded.where('id > ?', 0).limit(10).to_a
              when 1
                # INSERT query
                Sharded.create value: "thread_#{thread_id}_iter_#{i}"
              when 2
                # Transaction with multiple operations
                Sharded.transaction do
                  rec = Sharded.create value: "tx_#{thread_id}_#{i}"
                  Sharded.where(id: rec.id).first if rec.id
                end
              end
              mutex.synchronize { successful_count += 1 }
            rescue StandardError
              # A query that raised. Thread#kill is NOT rescued here (it does
              # not surface as a StandardError); kills are counted by the
              # killer thread below.
              mutex.synchronize { interrupted_count += 1 }
            end
          end
        end
      end

      # Killer thread: up to 50 times, after a short random pause, kill one
      # randomly chosen worker that is still alive.
      killer = Thread.new do
        50.times do
          sleep(rand(0.01..0.05))
          victim = chaos_threads.select(&:alive?).sample
          next unless victim

          victim.kill
          mutex.synchronize { interrupted_count += 1 }
        end
      end
      killer.join

      # Give surviving workers a brief grace period to finish on their own...
      chaos_threads.each { |t| t.join(0.1) }

      # ...then stop any that are still running. Thread#join with a timeout
      # returns nil and leaves the thread running, so without this step the
      # stragglers would keep issuing queries (and mutating the counters)
      # while the pool is disconnected and the verification phase runs.
      stragglers = chaos_threads.select(&:alive?)
      stragglers.each(&:kill)
      stragglers.each { |t| t.join(1) }
      mutex.synchronize { interrupted_count += stragglers.size }

      puts "Chaos phase complete: #{successful_count} successful, #{interrupted_count} interrupted"
      # Holds as soon as at least one worker was killed or one query raised.
      expect(interrupted_count).to be > 0
    end # End toxiproxy latency

    # Give PgDog time to clean up broken connections
    sleep(0.5)

    # Disconnect all connections to clear bad state
    ActiveRecord::Base.connection_pool.disconnect!

    # Wait a bit more for cleanup
    sleep(0.5)

    # Phase 2: Verify database continues to operate normally
    verification_errors = []
    errors_mutex = Mutex.new

    verification_threads = Array.new(10) do |thread_id|
      Thread.new do
        20.times do |i|
          begin
            # Simple queries that don't depend on finding specific records
            # INSERT
            rec = Sharded.create value: "verify_#{thread_id}_#{i}"
            expect(rec.id).to be > 0

            # SELECT with basic query
            results = Sharded.where('value LIKE ?', 'verify_%').limit(5).to_a
            expect(results).to be_a(Array)

            # COUNT query
            count = Sharded.where('id > ?', 0).count
            expect(count).to be >= 0
          rescue PG::Error
            # PG errors should fail the test: re-raise so the error ends the
            # thread instead of being collected by the clause below.
            raise
          rescue StandardError => e
            errors_mutex.synchronize { verification_errors << e }
          end
        end
      end
    end

    # Thread#join re-raises an exception that terminated the joined thread.
    verification_threads.each(&:join)

    # Verify no errors occurred during verification
    expect(verification_errors).to be_empty, "Verification errors: #{verification_errors.map(&:message).join(', ')}"

    # Verify we can still execute basic queries
    ActiveRecord::Base.connection.execute('SELECT 1')

    # Verify count works
    count = Sharded.count
    expect(count).to be >= 0
  end
end
190 | 316 | end |
0 commit comments