特来电混沌工程实践-混沌事件注入
上篇博文《特来电混沌工程实践》中,我们详细介绍了特来电混沌工程实践的设计和规划。目前我们已经启动了应用层的混沌实验。
在应用层的混沌实验中,我们经常需要模拟HSF服务容器的线程被打爆、CPU使用率25%,50%,75%、端口被打爆、内存泄露、服务超时、服务异常等场景。
前期实现的时候,我们一般会选择一个典型的HSF服务去模拟注入上述混沌事件场景。但是每次注入都会遇到准备时间长、耗时长、控制复杂等问题。
后来和阿里的中亭老师交流,受到了启发:我们应该写一个混沌事件注入工具,然后根据混沌实验场景,灵活地注入混沌事件。
因此,我们启动了混沌实验注入工具的研发。先说一下具体的思路吧:
- 统一混沌事件的注入接口,实现各类混沌事件注入
- 设计一个统一的混沌事件注入器,支持各类混沌事件注入,支持混沌事件的热更新和取消
- 在HSF、API网关、中间件SDK层面依赖注入混沌事件注入器
一、统一混沌事件的注入接口,实现各类混沌事件注入
1. 先定义混沌事件注入接口IChaosEvent,包含两个方法Inject注入和Stop停止
/// <summary>
/// Contract implemented by every chaos event that can be injected into the process.
/// </summary>
interface IChaosEvent
{
    /// <summary>
    /// Starts the chaos event.
    /// </summary>
    /// <param name="context">
    /// Event-specific settings. Implementations shown alongside this interface read keys such as
    /// "Count", "Server" and "Threads"; callers may also pass null.
    /// </param>
    void Inject(Dictionary<string, string> context);

    /// <summary>
    /// Stops the chaos event.
    /// </summary>
    void Stop();
}
同时增加一个混沌事件枚举ChaosEventType
/// <summary>
/// Kinds of chaos events that can be injected.
/// The numeric values are written out explicitly and match the declaration order.
/// </summary>
public enum ChaosEventType
{
    /// <summary>CPU load event at the 25% level.</summary>
    CPU25 = 0,

    /// <summary>CPU load event at the 50% level.</summary>
    CPU50 = 1,

    /// <summary>CPU load event at the 75% level.</summary>
    CPU75 = 2,

    /// <summary>Service call timeout event.</summary>
    ServiceTimeout = 3,

    /// <summary>Service call exception event.</summary>
    ServiceException = 4,

    /// <summary>Memory consumption event.</summary>
    Memory = 5,

    /// <summary>Thread exhaustion event.</summary>
    Threads = 6,

    /// <summary>Port (socket) exhaustion event.</summary>
    Ports = 7
}
2. 实现各类混沌事件注入
HighCpu-25%CPU使用率
/// <summary>
/// Chaos event that keeps about 25% of the logical processors busy with spinning tasks.
/// </summary>
class Chaos_HighCPU25 : IChaosEvent
{
    // Share of Environment.ProcessorCount that gets a spinning task.
    const double CpuShare = 25 / 100.0;

    CancellationTokenSource cts;

    public Chaos_HighCPU25()
    {
        cts = new CancellationTokenSource();
    }

    /// <summary>
    /// Starts the spinning tasks. May be called again after <see cref="Stop"/>.
    /// </summary>
    /// <param name="context">Not used by this event.</param>
    public void Inject(Dictionary<string, string> context)
    {
        try
        {
            // A cancelled source cannot be reused: a task created with an already-cancelled token
            // cannot be started, so Inject() after Stop() used to do nothing at all.
            if (cts.IsCancellationRequested)
            {
                cts = new CancellationTokenSource();
            }

            // Capture the token so the spinners keep watching the source they were started with.
            var token = cts.Token;

            // The fractional count is compared with an int counter, so any fraction yields one more task.
            var count = CpuShare * Environment.ProcessorCount;
            for (int i = 0; i < count; i++)
            {
                var cpuTask = new Task(() =>
                {
                    // Busy-wait until Stop() cancels the token.
                    while (!token.IsCancellationRequested)
                    {
                    }
                }, token, TaskCreationOptions.LongRunning);
                cpuTask.Start();
            }
        }
        catch (Exception ex)
        {
            // Injection stays best-effort (it must not break the host), but the failure is now visible.
            System.Diagnostics.Trace.TraceWarning("Chaos_HighCPU25 injection failed: " + ex);
        }
    }

    /// <summary>
    /// Cancels the token so that every spinning task exits its loop.
    /// </summary>
    public void Stop()
    {
        cts.Cancel();
    }
}
HighCpu-50%CPU使用率
/// <summary>
/// Chaos event that keeps about 50% of the logical processors busy with spinning tasks.
/// </summary>
class Chaos_HighCPU50 : IChaosEvent
{
    // Share of Environment.ProcessorCount that gets a spinning task.
    const double CpuShare = 50 / 100.0;

    CancellationTokenSource cts;

    public Chaos_HighCPU50()
    {
        cts = new CancellationTokenSource();
    }

    /// <summary>
    /// Starts the spinning tasks. May be called again after <see cref="Stop"/>.
    /// </summary>
    /// <param name="context">Not used by this event.</param>
    public void Inject(Dictionary<string, string> context)
    {
        try
        {
            // A cancelled source cannot be reused: a task created with an already-cancelled token
            // cannot be started, so Inject() after Stop() used to do nothing at all.
            if (cts.IsCancellationRequested)
            {
                cts = new CancellationTokenSource();
            }

            // Capture the token so the spinners keep watching the source they were started with.
            var token = cts.Token;

            // The fractional count is compared with an int counter, so any fraction yields one more task.
            var count = CpuShare * Environment.ProcessorCount;
            for (int i = 0; i < count; i++)
            {
                var cpuTask = new Task(() =>
                {
                    // Busy-wait until Stop() cancels the token.
                    while (!token.IsCancellationRequested)
                    {
                    }
                }, token, TaskCreationOptions.LongRunning);
                cpuTask.Start();
            }
        }
        catch (Exception ex)
        {
            // Injection stays best-effort (it must not break the host), but the failure is now visible.
            System.Diagnostics.Trace.TraceWarning("Chaos_HighCPU50 injection failed: " + ex);
        }
    }

    /// <summary>
    /// Cancels the token so that every spinning task exits its loop.
    /// </summary>
    public void Stop()
    {
        cts.Cancel();
    }
}
HighCpu-75%CPU使用率
/// <summary>
/// Chaos event that keeps about 75% of the logical processors busy with spinning tasks.
/// </summary>
class Chaos_HighCPU75 : IChaosEvent
{
    // Share of Environment.ProcessorCount that gets a spinning task.
    const double CpuShare = 75 / 100.0;

    CancellationTokenSource cts;

    public Chaos_HighCPU75()
    {
        cts = new CancellationTokenSource();
    }

    /// <summary>
    /// Starts the spinning tasks. May be called again after <see cref="Stop"/>.
    /// </summary>
    /// <param name="context">Not used by this event.</param>
    public void Inject(Dictionary<string, string> context)
    {
        try
        {
            // A cancelled source cannot be reused: a task created with an already-cancelled token
            // cannot be started, so Inject() after Stop() used to do nothing at all.
            if (cts.IsCancellationRequested)
            {
                cts = new CancellationTokenSource();
            }

            // Capture the token so the spinners keep watching the source they were started with.
            var token = cts.Token;

            // The fractional count is compared with an int counter, so any fraction yields one more task.
            var count = CpuShare * Environment.ProcessorCount;
            for (int i = 0; i < count; i++)
            {
                var cpuTask = new Task(() =>
                {
                    // Busy-wait until Stop() cancels the token.
                    while (!token.IsCancellationRequested)
                    {
                    }
                }, token, TaskCreationOptions.LongRunning);
                cpuTask.Start();
            }
        }
        catch (Exception ex)
        {
            // Injection stays best-effort (it must not break the host), but the failure is now visible.
            System.Diagnostics.Trace.TraceWarning("Chaos_HighCPU75 injection failed: " + ex);
        }
    }

    /// <summary>
    /// Cancels the token so that every spinning task exits its loop.
    /// </summary>
    public void Stop()
    {
        cts.Cancel();
    }
}
内存泄露-2G
/// <summary>
/// Chaos event that allocates large strings and keeps them referenced from a static list
/// until <see cref="Stop"/> is called. The amount is read from the "Chaos.MemoryMB" app
/// setting and defaults to "2000".
/// </summary>
class Chaos_Memory : IChaosEvent
{
    CancellationTokenSource cts;

    // Building block for the allocated strings: 1024 '1' characters.
    // NOTE(review): this replaces a hand-typed literal of '1' characters that was assumed to be
    // 1024 long, as the name suggests — confirm if the exact length matters.
    static string OneKB = new string('1', 1024);

    // Holds the allocated strings so they cannot be garbage-collected.
    static List<string> list = new List<string>();

    // Guards 'list', which is touched by both the allocating task and Stop().
    static readonly object listLock = new object();

    public Chaos_Memory()
    {
        cts = new CancellationTokenSource();
    }

    /// <summary>
    /// Starts a background task that allocates the configured amount of memory.
    /// May be called again after <see cref="Stop"/>.
    /// </summary>
    /// <param name="context">Not used by this event.</param>
    public void Inject(Dictionary<string, string> context)
    {
        try
        {
            var count = System.Configuration.ConfigurationManager.AppSettings["Chaos.MemoryMB"];
            if (count == null)
            {
                count = "2000";
            }

            int c;
            if (!int.TryParse(count, out c))
            {
                System.Diagnostics.Trace.TraceWarning("Chaos_Memory: 'Chaos.MemoryMB' is not an integer: " + count);
                return;
            }

            // A cancelled source cannot be reused; without this, Inject() after Stop() did nothing.
            if (cts.IsCancellationRequested)
            {
                cts = new CancellationTokenSource();
            }

            var token = cts.Token;
            Task task = new Task(() =>
            {
                // Each chunk is 1024 * OneKB characters; .NET chars are 2 bytes, which is
                // presumably why only c / 2 chunks are created for c megabytes.
                for (int k = 0; k < c / 2; k++)
                {
                    // Stop allocating as soon as Stop() was called; previously the loop ran to
                    // completion and refilled the list that Stop() had just replaced.
                    if (token.IsCancellationRequested)
                    {
                        return;
                    }

                    StringBuilder builder = new StringBuilder();
                    for (int i = 0; i < 1024; i++)
                    {
                        builder.Append(OneKB);
                    }

                    var chunk = builder.ToString();
                    lock (listLock)
                    {
                        // Re-check under the lock so nothing is added after Stop() cleared the list.
                        if (token.IsCancellationRequested)
                        {
                            return;
                        }

                        list.Add(chunk);
                    }
                }
            }, token, TaskCreationOptions.LongRunning);
            task.Start();
        }
        catch (Exception ex)
        {
            // Injection stays best-effort, but the failure is now visible.
            System.Diagnostics.Trace.TraceWarning("Chaos_Memory injection failed: " + ex);
        }
    }

    /// <summary>
    /// Cancels the allocating task and drops every retained string.
    /// </summary>
    public void Stop()
    {
        cts.Cancel();
        lock (listLock)
        {
            list.Clear();
            list = new List<string>();
        }
    }
}
端口被打爆:
/// <summary>
/// Chaos event that opens many TCP connections to one server and keeps them open
/// until <see cref="Stop"/> is called.
/// </summary>
class Chaos_Ports : IChaosEvent
{
    CancellationTokenSource cts;

    // Connections that are currently held open. Shared by all instances; each new instance
    // replaces the list in its constructor.
    static List<Socket> sockets;

    // Guards 'sockets', which is touched by both the connecting task and Stop().
    static readonly object socketsLock = new object();

    public Chaos_Ports()
    {
        cts = new CancellationTokenSource();
        sockets = new List<Socket>();
    }

    /// <summary>
    /// Starts a background task that opens the requested number of connections.
    /// May be called again after <see cref="Stop"/>.
    /// </summary>
    /// <param name="context">
    /// Must contain "Count" (number of connections) and "Server" in the form "host:port".
    /// </param>
    public void Inject(Dictionary<string, string> context)
    {
        try
        {
            var count = Convert.ToInt32(context["Count"]);
            var server = Convert.ToString(context["Server"]);
            var sp = server.Split(':');
            var host = sp[0];
            var port = Convert.ToInt32(sp[1]);

            // A cancelled source cannot be reused; without this the task started after a Stop()
            // would never park and could never be told apart from a stopped one.
            if (cts.IsCancellationRequested)
            {
                cts = new CancellationTokenSource();
            }

            var token = cts.Token;
            var task = Task.Factory.StartNew(() =>
            {
                for (int i = 0; i < count && !token.IsCancellationRequested; i++)
                {
                    Socket socket = null;
                    bool kept = false;
                    try
                    {
                        socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
                        socket.SetSocketOption(SocketOptionLevel.Socket, SocketOptionName.KeepAlive, true);
                        socket.Connect(host, port);
                        // presumably milliseconds, as in the Windows tcp_keepalive structure — verify
                        SetKeepAliveValues(socket, true, 36000000, 1000);

                        lock (socketsLock)
                        {
                            // Do not register a connection after Stop() already closed the others.
                            if (!token.IsCancellationRequested)
                            {
                                // Stop() leaves the list null; recreate it so re-injection works.
                                if (sockets == null)
                                {
                                    sockets = new List<Socket>();
                                }

                                sockets.Add(socket);
                                kept = true;
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        // One failed connection must not end the whole run.
                        System.Diagnostics.Trace.TraceWarning("Chaos_Ports connection failed: " + e.Message);
                    }
                    finally
                    {
                        // A socket that was not registered would never be closed by Stop().
                        if (!kept && socket != null)
                        {
                            CloseQuietly(socket);
                        }
                    }
                }

                // Park this thread until Stop(); unlike a long Thread.Sleep this returns immediately
                // on cancellation.
                token.WaitHandle.WaitOne();
            }, TaskCreationOptions.LongRunning);
        }
        catch (Exception ex)
        {
            // Injection stays best-effort, but the failure (e.g. missing context keys) is now visible.
            System.Diagnostics.Trace.TraceWarning("Chaos_Ports injection failed: " + ex);
        }
    }

    /// <summary>
    /// Cancels the background task and closes every connection that is being held open.
    /// </summary>
    public void Stop()
    {
        cts.Cancel();
        lock (socketsLock)
        {
            if (sockets != null)
            {
                foreach (var socket in sockets)
                {
                    CloseQuietly(socket);
                }

                sockets.Clear();
                sockets = null;
            }
        }
    }

    /// <summary>
    /// Applies TCP keep-alive settings to a socket through IOControl.
    /// </summary>
    /// <param name="Socket">Socket to configure.</param>
    /// <param name="On_Off">Whether keep-alive is enabled.</param>
    /// <param name="KeepaLiveTime">Keep-alive time value written at byte offset 4.</param>
    /// <param name="KeepaLiveInterval">Keep-alive interval value written at byte offset 8.</param>
    /// <returns>The value returned by Socket.IOControl.</returns>
    public int SetKeepAliveValues
        (
            System.Net.Sockets.Socket Socket,
            bool On_Off,
            uint KeepaLiveTime,
            uint KeepaLiveInterval
        )
    {
        int Result = -1;

        unsafe
        {
            TcpKeepAlive KeepAliveValues = new TcpKeepAlive();

            KeepAliveValues.On_Off = Convert.ToUInt32(On_Off);
            KeepAliveValues.KeepaLiveTime = KeepaLiveTime;
            KeepAliveValues.KeepaLiveInterval = KeepaLiveInterval;

            // Copy the 12 overlapping bytes of the struct into the IOControl input buffer.
            byte[] InValue = new byte[12];

            for (int I = 0; I < 12; I++)
                InValue[I] = KeepAliveValues.Bytes[I];

            Result = Socket.IOControl(IOControlCode.KeepAliveValues, InValue, null);
        }

        return Result;
    }

    // Closes a socket and ignores errors; there is nothing useful to do if closing fails.
    private static void CloseQuietly(Socket socket)
    {
        try
        {
            socket.Close();
        }
        catch (Exception)
        {
        }
    }
}

/// <summary>
/// 12-byte keep-alive settings block: three uint fields at offsets 0, 4 and 8, overlaid
/// by a byte buffer so the raw bytes can be copied out.
/// </summary>
[
    System.Runtime.InteropServices.StructLayout
    (
        System.Runtime.InteropServices.LayoutKind.Explicit
    )
]
unsafe struct TcpKeepAlive
{
    [System.Runtime.InteropServices.FieldOffset(0)]
    [
        System.Runtime.InteropServices.MarshalAs
        (
            System.Runtime.InteropServices.UnmanagedType.ByValArray,
            SizeConst = 12
        )
    ]
    public fixed byte Bytes[12];

    [System.Runtime.InteropServices.FieldOffset(0)]
    public uint On_Off;

    [System.Runtime.InteropServices.FieldOffset(4)]
    public uint KeepaLiveTime;

    [System.Runtime.InteropServices.FieldOffset(8)]
    public uint KeepaLiveInterval;
}
线程被打爆:
/// <summary>
/// Chaos event that starts many tasks which each block their thread for up to 20 minutes.
/// </summary>
class Chaos_Threads : IChaosEvent
{
    CancellationTokenSource cts;

    // Tasks started by Inject(); guarded by tasksLock because Stop() may run on another thread.
    List<Task> tasks = new List<Task>();
    readonly object tasksLock = new object();

    public Chaos_Threads()
    {
        cts = new CancellationTokenSource();
    }

    /// <summary>
    /// Starts the blocking tasks. May be called again after <see cref="Stop"/>.
    /// </summary>
    /// <param name="context">Must contain "Threads": the number of tasks to start.</param>
    public void Inject(Dictionary<string, string> context)
    {
        try
        {
            var count = context["Threads"];
            int c;
            if (!Int32.TryParse(count, out c))
            {
                System.Diagnostics.Trace.TraceWarning("Chaos_Threads: 'Threads' is not an integer: " + count);
                return;
            }

            // A cancelled source cannot be reused; without this, Inject() after Stop() did nothing.
            if (cts.IsCancellationRequested)
            {
                cts = new CancellationTokenSource();
            }

            var token = cts.Token;
            for (int i = 0; i < c; i++)
            {
                var task = new Task(() =>
                {
                    // Hold the thread for 120 x 10 s. Waiting on the token's handle instead of
                    // sleeping lets the thread go as soon as Stop() is called.
                    for (int j = 0; j < 120; j++)
                    {
                        if (token.WaitHandle.WaitOne(10 * 1000))
                        {
                            return;
                        }
                    }
                }, token);
                task.Start();

                lock (tasksLock)
                {
                    tasks.Add(task);
                }
            }
        }
        catch (Exception ex)
        {
            // Injection stays best-effort, but the failure (e.g. a missing "Threads" key) is now visible.
            System.Diagnostics.Trace.TraceWarning("Chaos_Threads injection failed: " + ex);
        }
    }

    /// <summary>
    /// Cancels all blocking tasks and forgets them.
    /// </summary>
    public void Stop()
    {
        cts.Cancel();

        // The tasks are only dropped, not disposed: Task.Dispose throws for a task that has not
        // finished yet, which is the normal state right after cancelling.
        lock (tasksLock)
        {
            tasks.Clear();
        }
    }
}
服务调用异常:
/// <summary>
/// Chaos event that makes every injected call throw until it has been stopped.
/// </summary>
class Chaos_ServiceException : IChaosEvent
{
    // Set by Stop(); once true, Inject() no longer throws.
    bool stopped = false;

    public Chaos_ServiceException()
    {
    }

    /// <summary>
    /// Throws an exception with the message "Chaos_ServiceException" unless the event was stopped.
    /// </summary>
    /// <param name="context">Not used by this event.</param>
    public void Inject(Dictionary<string, string> context)
    {
        if (stopped)
        {
            return;
        }

        throw new Exception("Chaos_ServiceException");
    }

    /// <summary>
    /// Turns the event off for all following Inject() calls.
    /// </summary>
    public void Stop()
    {
        stopped = true;
    }
}
服务调用超时:
/// <summary>
/// Chaos event that delays every injected call by 10 seconds until it has been stopped.
/// </summary>
class Chaos_ServiceTimeout : IChaosEvent
{
    // Cancelled by Stop(); doubles as the "stopped" flag and as the wake-up signal for calls
    // that are currently being delayed.
    CancellationTokenSource cts;

    public Chaos_ServiceTimeout()
    {
        cts = new CancellationTokenSource();
    }

    /// <summary>
    /// Blocks the calling thread for up to 10 seconds unless the event was stopped.
    /// </summary>
    /// <param name="context">Not used by this event.</param>
    public void Inject(Dictionary<string, string> context)
    {
        if (cts.IsCancellationRequested)
        {
            return;
        }

        // Waiting on the token's handle returns quietly when Stop() is called. The previous
        // Task.Delay(..., token).Wait() threw an AggregateException into the delayed service call
        // whenever the event was stopped mid-delay.
        cts.Token.WaitHandle.WaitOne(10 * 1000);
    }

    /// <summary>
    /// Turns the event off and releases calls that are currently being delayed.
    /// </summary>
    public void Stop()
    {
        cts.Cancel();
    }
}
二、设计一个统一的混沌事件注入器,支持各类混沌事件注入,支持混沌事件的热更新和取消
1. ChaosEventInjecter
支持混沌事件接口实现的创建、混沌事件注入(全局只注入一次,或每次调用都注入)、混沌事件取消(停止)
混沌事件接口实现的创建
/// <summary>
/// Returns the cached chaos event for the given type, creating and caching it on first use.
/// </summary>
/// <param name="chaosEventType">Type of chaos event to look up.</param>
/// <returns>
/// The chaos event, or null when the type has no implementation. Callers check for null;
/// previously an unsupported type ended in a KeyNotFoundException from the dictionary indexer.
/// </returns>
private IChaosEvent GetOrCreateChaosEvent(ChaosEventType chaosEventType)
{
    IChaosEvent chaosEvent;
    if (eventDic.TryGetValue(chaosEventType, out chaosEvent))
    {
        return chaosEvent;
    }

    lock (syncObj)
    {
        // Another thread may have created the event while this one waited for the lock.
        if (eventDic.TryGetValue(chaosEventType, out chaosEvent))
        {
            return chaosEvent;
        }

        switch (chaosEventType)
        {
            case ChaosEventType.CPU75:
                chaosEvent = new Chaos_HighCPU75();
                break;
            case ChaosEventType.CPU50:
                chaosEvent = new Chaos_HighCPU50();
                break;
            case ChaosEventType.CPU25:
                chaosEvent = new Chaos_HighCPU25();
                break;
            case ChaosEventType.Memory:
                chaosEvent = new Chaos_Memory();
                break;
            case ChaosEventType.Threads:
                chaosEvent = new Chaos_Threads();
                break;
            case ChaosEventType.ServiceException:
                chaosEvent = new Chaos_ServiceException();
                break;
            case ChaosEventType.ServiceTimeout:
                chaosEvent = new Chaos_ServiceTimeout();
                break;
            case ChaosEventType.Ports:
                chaosEvent = new Chaos_Ports();
                break;
            default:
                chaosEvent = null;
                break;
        }

        if (chaosEvent != null)
        {
            eventDic.TryAdd(chaosEventType, chaosEvent);
        }

        return chaosEvent;
    }
}
混沌事件注入(全局注入一次,每次调用都注入)
// Guards creation of the singleton instance and of chaos event implementations.
private static object syncObj = new object();

// Guards one-time injection of singleton chaos events.
private static object eventObj = new object();

private static ChaosEventInjecter instance;

// Chaos event implementations, created on first use and cached per event type.
private ConcurrentDictionary<ChaosEventType, IChaosEvent> eventDic;

// Event types that have already been injected once; the value simply repeats the key.
private ConcurrentDictionary<ChaosEventType, ChaosEventType> triggeredEvent;

private ChaosEventInjecter()
{
    eventDic = new ConcurrentDictionary<ChaosEventType, IChaosEvent>();
    triggeredEvent = new ConcurrentDictionary<ChaosEventType, ChaosEventType>();
}
/// <summary>
/// Injects the given chaos event unless it has already been recorded as triggered.
/// </summary>
/// <param name="chaosEventType">Type of chaos event to inject.</param>
/// <param name="context">Event-specific settings passed through to the event; may be null.</param>
public void SingletonInject(ChaosEventType chaosEventType, Dictionary<string, string> context = null)
{
    // Cheap check first so the common "already injected" case never takes the lock.
    if (triggeredEvent.ContainsKey(chaosEventType))
    {
        return;
    }

    lock (eventObj)
    {
        // Re-check under the lock: another thread may have injected in the meantime.
        if (triggeredEvent.ContainsKey(chaosEventType))
        {
            return;
        }

        var target = GetOrCreateChaosEvent(chaosEventType);
        if (target != null)
        {
            target.Inject(context);
            triggeredEvent.TryAdd(chaosEventType, chaosEventType);
        }
    }
}
按服务每次调用都注入
/// <summary>
/// Injects the chaos events configured for the given services.
/// ServiceException and ServiceTimeout events are injected on every call; all other
/// event types go through SingletonInject.
/// </summary>
/// <param name="serviceId">Identifiers of the services taking part in the current call.</param>
public void ServiceInject(List<string> serviceId)
{
    var manager = ChaosEventManager.GetIntance();

    // Give the manager a way to stop running events.
    manager.StopInject = StopInject;

    if (manager.IsEmpty())
    {
        StopInject();
    }

    foreach (var id in serviceId)
    {
        var configured = manager.GetChaosEvent(id);
        if (configured == null)
        {
            continue;
        }

        var eventType = configured.ChaosEventType;
        bool injectOnEveryCall = eventType == ChaosEventType.ServiceException
                                 || eventType == ChaosEventType.ServiceTimeout;

        if (injectOnEveryCall)
        {
            Inject(eventType, configured.ChaosValue);
        }
        else
        {
            SingletonInject(eventType, configured.ChaosValue);
        }
    }
}
停止混沌注入
/// <summary>
/// Stops every chaos event that was injected through SingletonInject and forgets them,
/// so they can be injected again later.
/// </summary>
public void StopInject()
{
    // '||' instead of the former '&&': with '&&' a null dictionary was dereferenced and an
    // empty one was never short-circuited.
    if (triggeredEvent == null || triggeredEvent.Count == 0)
    {
        return;
    }

    // Same lock as SingletonInject, so an event injected while this runs cannot be dropped
    // from the bookkeeping without having been stopped.
    lock (eventObj)
    {
        foreach (var chaosEventType in triggeredEvent)
        {
            var chaosEvent = GetOrCreateChaosEvent(chaosEventType.Key);
            if (chaosEvent == null)
            {
                // Skip this entry only; returning here left the remaining events running.
                continue;
            }

            try
            {
                chaosEvent.Stop();
            }
            catch (Exception ex)
            {
                // One failing event must not keep the others running.
                System.Diagnostics.Trace.TraceWarning("Stopping chaos event " + chaosEventType.Key + " failed: " + ex);
            }
        }

        triggeredEvent = new ConcurrentDictionary<ChaosEventType, ChaosEventType>();
    }
}
完整的ChaosEventInjecter代码:
/// <summary>
/// Chaos event injector: creates chaos event implementations on demand, injects them
/// (once per process or on every call) and stops them again.
/// </summary>
public class ChaosEventInjecter
{
    // Guards creation of the singleton instance and of chaos event implementations.
    private static object syncObj = new object();

    // Guards one-time injection and stopping of singleton chaos events.
    private static object eventObj = new object();

    private static ChaosEventInjecter instance;

    // Chaos event implementations, created on first use and cached per event type.
    private ConcurrentDictionary<ChaosEventType, IChaosEvent> eventDic;

    // Event types that have already been injected once; the value simply repeats the key.
    private ConcurrentDictionary<ChaosEventType, ChaosEventType> triggeredEvent;

    private ChaosEventInjecter()
    {
        eventDic = new ConcurrentDictionary<ChaosEventType, IChaosEvent>();
        triggeredEvent = new ConcurrentDictionary<ChaosEventType, ChaosEventType>();
    }

    /// <summary>
    /// Returns the process-wide injector, creating it on first use.
    /// </summary>
    public static ChaosEventInjecter GetIntance()
    {
        if (instance == null)
        {
            lock (syncObj)
            {
                if (instance == null)
                {
                    instance = new ChaosEventInjecter();
                }
            }
        }
        return instance;
    }

    /// <summary>
    /// Injects, once, the chaos event named by the "Chaos.Event" app setting, if that setting exists.
    /// </summary>
    public void SingletonInject()
    {
        var eventType = System.Configuration.ConfigurationManager.AppSettings["Chaos.Event"];
        if (eventType == null)
        {
            return;
        }

        // A misspelled setting is reported and ignored instead of throwing into the host.
        ChaosEventType chaosEvent;
        if (!Enum.TryParse(eventType, out chaosEvent))
        {
            System.Diagnostics.Trace.TraceWarning("Unknown 'Chaos.Event' setting: " + eventType);
            return;
        }

        SingletonInject(chaosEvent);
    }

    /// <summary>
    /// Injects the chaos events configured for the given services.
    /// ServiceException and ServiceTimeout events are injected on every call; all other
    /// event types are injected once.
    /// </summary>
    /// <param name="serviceId">Identifiers of the services taking part in the current call.</param>
    public void ServiceInject(List<string> serviceId)
    {
        // Give the manager a way to stop running events.
        ChaosEventManager.GetIntance().StopInject = StopInject;
        if (ChaosEventManager.GetIntance().IsEmpty())
        {
            StopInject();
        }

        foreach (var service in serviceId)
        {
            var chaosEvent = ChaosEventManager.GetIntance().GetChaosEvent(service);
            if (chaosEvent != null)
            {
                switch (chaosEvent.ChaosEventType)
                {
                    case ChaosEventType.ServiceException:
                    case ChaosEventType.ServiceTimeout:
                        Inject(chaosEvent.ChaosEventType, chaosEvent.ChaosValue);
                        break;
                    default:
                        SingletonInject(chaosEvent.ChaosEventType, chaosEvent.ChaosValue);
                        break;
                }
            }
        }
    }

    /// <summary>
    /// Injects the given chaos event unless it has already been recorded as triggered.
    /// </summary>
    /// <param name="chaosEventType">Type of chaos event to inject.</param>
    /// <param name="context">Event-specific settings passed through to the event; may be null.</param>
    public void SingletonInject(ChaosEventType chaosEventType, Dictionary<string, string> context = null)
    {
        if (!triggeredEvent.ContainsKey(chaosEventType))
        {
            lock (eventObj)
            {
                // Re-check under the lock: another thread may have injected in the meantime.
                if (!triggeredEvent.ContainsKey(chaosEventType))
                {
                    var chaosEvent = GetOrCreateChaosEvent(chaosEventType);
                    if (chaosEvent == null) return;

                    chaosEvent.Inject(context);
                    triggeredEvent.TryAdd(chaosEventType, chaosEventType);
                }
            }
        }
    }

    /// <summary>
    /// Stops every chaos event that was injected through SingletonInject and forgets them,
    /// so they can be injected again later.
    /// </summary>
    public void StopInject()
    {
        // '||' instead of the former '&&': with '&&' a null dictionary was dereferenced and an
        // empty one was never short-circuited.
        if (triggeredEvent == null || triggeredEvent.Count == 0)
        {
            return;
        }

        // Same lock as SingletonInject, so an event injected while this runs cannot be dropped
        // from the bookkeeping without having been stopped.
        lock (eventObj)
        {
            foreach (var chaosEventType in triggeredEvent)
            {
                var chaosEvent = GetOrCreateChaosEvent(chaosEventType.Key);
                if (chaosEvent == null)
                {
                    // Skip this entry only; returning here left the remaining events running.
                    continue;
                }

                try
                {
                    chaosEvent.Stop();
                }
                catch (Exception ex)
                {
                    // One failing event must not keep the others running.
                    System.Diagnostics.Trace.TraceWarning("Stopping chaos event " + chaosEventType.Key + " failed: " + ex);
                }
            }

            triggeredEvent = new ConcurrentDictionary<ChaosEventType, ChaosEventType>();
        }
    }

    /// <summary>
    /// Injects the given chaos event on every call, without recording it as triggered.
    /// </summary>
    /// <param name="chaosEventType">Type of chaos event to inject.</param>
    /// <param name="context">Event-specific settings passed through to the event; may be null.</param>
    public void Inject(ChaosEventType chaosEventType, Dictionary<string, string> context = null)
    {
        var chaosEvent = GetOrCreateChaosEvent(chaosEventType);
        if (chaosEvent == null) return;

        chaosEvent.Inject(context);
    }

    /// <summary>
    /// Returns the cached chaos event for the given type, creating and caching it on first use.
    /// </summary>
    /// <param name="chaosEventType">Type of chaos event to look up.</param>
    /// <returns>
    /// The chaos event, or null when the type has no implementation. Previously an unsupported
    /// type ended in a KeyNotFoundException from the dictionary indexer.
    /// </returns>
    private IChaosEvent GetOrCreateChaosEvent(ChaosEventType chaosEventType)
    {
        IChaosEvent chaosEvent;
        if (eventDic.TryGetValue(chaosEventType, out chaosEvent))
        {
            return chaosEvent;
        }

        lock (syncObj)
        {
            // Another thread may have created the event while this one waited for the lock.
            if (eventDic.TryGetValue(chaosEventType, out chaosEvent))
            {
                return chaosEvent;
            }

            switch (chaosEventType)
            {
                case ChaosEventType.CPU75:
                    chaosEvent = new Chaos_HighCPU75();
                    break;
                case ChaosEventType.CPU50:
                    chaosEvent = new Chaos_HighCPU50();
                    break;
                case ChaosEventType.CPU25:
                    chaosEvent = new Chaos_HighCPU25();
                    break;
                case ChaosEventType.Memory:
                    chaosEvent = new Chaos_Memory();
                    break;
                case ChaosEventType.Threads:
                    chaosEvent = new Chaos_Threads();
                    break;
                case ChaosEventType.ServiceException:
                    chaosEvent = new Chaos_ServiceException();
                    break;
                case ChaosEventType.ServiceTimeout:
                    chaosEvent = new Chaos_ServiceTimeout();
                    break;
                case ChaosEventType.Ports:
                    chaosEvent = new Chaos_Ports();
                    break;
                default:
                    chaosEvent = null;
                    break;
            }

            if (chaosEvent != null)
            {
                eventDic.TryAdd(chaosEventType, chaosEvent);
            }

            return chaosEvent;
        }
    }
}
2. ChaosEventManager
混沌事件管理类,负责从Redis中实时获取每个服务配置的混沌事件,并支持每10秒定时更新一次混沌事件:
/// <summary>
/// Keeps the chaos events configured per service. The events are read from the
/// "ChaosEvents" hash of the cache and re-read every 10 seconds on a background task.
/// </summary>
class ChaosEventManager
{
    private static object syncObj = new object();

    private static ChaosEventManager instance;

    // Current snapshot, keyed by service id. Replaced as a whole on every refresh, hence volatile.
    private volatile ConcurrentDictionary<string, ChaosEvent> eventDic;

    CacheService service = CacheService.GetInstance("DefaultPool");

    /// <summary>
    /// Callback invoked when a refresh finds no chaos events configured at all.
    /// </summary>
    public Action StopInject { get; set; }

    private ChaosEventManager()
    {
        eventDic = new ConcurrentDictionary<string, ChaosEvent>();

        try
        {
            GetAllChaosEvents();
        }
        catch (Exception ex)
        {
            // A failing first load must not make every GetIntance() call throw;
            // the background refresh below retries every 10 seconds.
            System.Diagnostics.Trace.TraceWarning("Initial chaos event load failed: " + ex);
        }

        StartUpdateTask();
    }

    // Starts the refresh loop and restarts it if it ever ends with an error.
    private void StartUpdateTask()
    {
        var task = new Task(() =>
        {
            while (true)
            {
                Thread.Sleep(10000);
                GetAllChaosEvents();
            }
        }, TaskCreationOptions.LongRunning);

        task.ContinueWith((t) =>
        {
            if (t.IsFaulted)
                StartUpdateTask();
        });

        task.Start();
    }

    /// <summary>
    /// Returns the process-wide manager, creating it on first use.
    /// </summary>
    public static ChaosEventManager GetIntance()
    {
        if (instance == null)
        {
            lock (syncObj)
            {
                if (instance == null)
                {
                    instance = new ChaosEventManager();
                }
            }
        }
        return instance;
    }

    /// <summary>
    /// Returns the chaos event configured for a service.
    /// </summary>
    /// <param name="serviceId">Service identifier used as the hash key.</param>
    /// <returns>The configured chaos event, or null when the service has none.</returns>
    public ChaosEvent GetChaosEvent(string serviceId)
    {
        // Single lookup; ContainsKey followed by the indexer could throw if the entry
        // disappeared in between.
        ChaosEvent chaosEvent;
        return eventDic.TryGetValue(serviceId, out chaosEvent) ? chaosEvent : null;
    }

    /// <summary>
    /// Tells whether no chaos event is configured for any service.
    /// </summary>
    public bool IsEmpty()
    {
        var current = eventDic;
        return current == null || current.Count == 0;
    }

    // Reads all configured chaos events from the cache and makes them the current snapshot.
    private void GetAllChaosEvents()
    {
        var newEventDic = new ConcurrentDictionary<string, ChaosEvent>();
        using (var client = service.GetClient())
        {
            List<string> keys = client.GetHashKeys("ChaosEvents");
            if (keys != null)
                keys.ForEach(x => newEventDic.TryAdd(x, client.GetValueFromHash("ChaosEvents", x)));
        }

        // Replace the snapshot instead of merging into it. Merging never removed anything, so
        // events deleted from the cache stayed active and IsEmpty() never became true again.
        eventDic = newEventDic;

        if (newEventDic.Count == 0)
        {
            // Copy the delegate first so a concurrent change of the property cannot null it
            // between the check and the call.
            var stop = StopInject;
            if (stop != null)
                stop();
        }
    }
}
三、在HSF、API网关、中间件SDK层面依赖注入混沌事件注入器
在HSF服务调用时增加混沌实验事件AOP注入
API网关、中间件SDK类似的方法进行注入。
同时我们设计了一个混沌事件注入工具:支持混沌事件实时注入、取消、仿真模拟执行:
分享:仿真模拟执行的效果,例如CPU25%使用率:
以上工具和设计思路,分享给大家。
周国庆
2019/3/30