特来电混沌工程实践-混沌事件注入


上篇博文《特来电混沌工程实践》中,我们详细介绍了特来电混沌工程实践的设计和规划。目前我们已经启动了应用层的混沌实验。

在应用层的混沌实验中,我们经常需要模拟HSF服务容器的线程被打爆、CPU使用率25%,50%,75%、端口被打爆、内存泄露、服务超时、服务异常等场景。

前期我们实现的时候,一般会选择一个典型的HSF服务去模拟注入上述混沌事件场景。但是每次注入都会遇到准备时间长、耗时长、控制复杂等问题。

后来和阿里的中亭老师交流,受到了启发:我们应该写一个混沌事件注入工具,然后根据混沌实验场景,灵活地注入混沌事件。

因此,我们启动了混沌实验注入工具的研发。先说一下具体的思路吧:

  1. 统一混沌事件的注入接口,实现各类混沌事件注入
  2. 设计一个统一的混沌事件注入器,支持各类混沌事件注入,支持混沌事件的热更新和取消
  3. 在HSF、API网关、中间件SDK层面依赖注入混沌事件注入器

一、统一混沌事件的注入接口,实现各类混沌事件注入

  1. 先定义混沌事件注入接口IChaosEvent,包含两个方法Inject注入和Stop停止

/// <summary>
/// Contract shared by every chaos event: one method to start the disturbance
/// and one to end it.
/// </summary>
interface IChaosEvent
{
    /// <summary>
    /// Starts (or applies) the chaos event.
    /// </summary>
    /// <param name="context">
    /// Event-specific settings as string key/value pairs. The injector passes
    /// null when no settings are configured, so implementations must tolerate it.
    /// </param>
    void Inject(Dictionary<string, string> context);

    /// <summary>
    /// Stops the chaos event.
    /// </summary>
    void Stop();
}

  同时增加一个混沌事件枚举ChaosEventType

/// <summary>
/// Kinds of chaos events the injector can create. No explicit values are
/// assigned, so members are numbered by declaration order starting at 0.
/// </summary>
public enum ChaosEventType
{
    /// <summary>CPU load scenario, 25% variant.</summary>
    CPU25,
    /// <summary>CPU load scenario, 50% variant.</summary>
    CPU50,
    /// <summary>CPU load scenario, 75% variant.</summary>
    CPU75,
    /// <summary>Service call timeout scenario.</summary>
    ServiceTimeout,
    /// <summary>Service call exception scenario.</summary>
    ServiceException,
    /// <summary>Memory growth scenario.</summary>
    Memory,
    /// <summary>Thread exhaustion scenario.</summary>
    Threads,
    /// <summary>Port/connection exhaustion scenario.</summary>
    Ports
}

2. 实现各类混沌事件注入

   HighCpu-25%CPU使用率

 /// <summary>
 /// Chaos event that spins busy-loop threads on about 25% of the logical
 /// processors until <see cref="Stop"/> is called.
 /// </summary>
 class Chaos_HighCPU25 : IChaosEvent
 {
     // Replaced on every Stop(): a cancelled CancellationTokenSource cannot be
     // reset, and a Task built on a cancelled token cannot be started, so
     // reusing one source would make every Inject() after the first Stop() a no-op.
     CancellationTokenSource cts;

     public Chaos_HighCPU25()
     {
         cts = new CancellationTokenSource();
     }

     /// <summary>
     /// Starts the spinning threads.
     /// </summary>
     /// <param name="context">Not used by this event; may be null.</param>
     public void Inject(Dictionary<string, string> context)
     {
         // Capture the token once so the workers keep watching the source that
         // was current when they started, even after Stop() installs a new one.
         CancellationToken token = cts.Token;

         // Rounds up, which yields the same number of threads as looping
         // "i < 0.25 * cores" with a fractional bound.
         int workers = (int)Math.Ceiling((25 / 100.0) * Environment.ProcessorCount);

         try
         {
             for (int i = 0; i < workers; i++)
             {
                 var cpuTask = new Task(() =>
                 {
                     // Intentional busy loop: burning CPU is the whole point.
                     while (!token.IsCancellationRequested)
                     {
                     }
                 }, token, TaskCreationOptions.LongRunning);

                 cpuTask.Start();
             }
         }
         catch (Exception ex)
         {
             // Best effort: chaos injection must never break the host service,
             // but a failure should at least be visible.
             System.Diagnostics.Trace.TraceWarning("Chaos_HighCPU25 injection failed: {0}", ex);
         }
     }

     /// <summary>
     /// Stops all spinning threads and re-arms the event so it can be injected again.
     /// </summary>
     public void Stop()
     {
         // Not disposed: workers may still be reading the old token.
         CancellationTokenSource previous = Interlocked.Exchange(ref cts, new CancellationTokenSource());
         previous.Cancel();
     }
 }

  HighCpu-50%CPU使用率

 /// <summary>
 /// Chaos event that spins busy-loop threads on about 50% of the logical
 /// processors until <see cref="Stop"/> is called.
 /// </summary>
 class Chaos_HighCPU50 : IChaosEvent
 {
     // Replaced on every Stop(): a cancelled CancellationTokenSource cannot be
     // reset, and a Task built on a cancelled token cannot be started, so
     // reusing one source would make every Inject() after the first Stop() a no-op.
     CancellationTokenSource cts;

     public Chaos_HighCPU50()
     {
         cts = new CancellationTokenSource();
     }

     /// <summary>
     /// Starts the spinning threads.
     /// </summary>
     /// <param name="context">Not used by this event; may be null.</param>
     public void Inject(Dictionary<string, string> context)
     {
         // Capture the token once so the workers keep watching the source that
         // was current when they started, even after Stop() installs a new one.
         CancellationToken token = cts.Token;

         // Rounds up, which yields the same number of threads as looping
         // "i < 0.5 * cores" with a fractional bound.
         int workers = (int)Math.Ceiling((50 / 100.0) * Environment.ProcessorCount);

         try
         {
             for (int i = 0; i < workers; i++)
             {
                 var cpuTask = new Task(() =>
                 {
                     // Intentional busy loop: burning CPU is the whole point.
                     while (!token.IsCancellationRequested)
                     {
                     }
                 }, token, TaskCreationOptions.LongRunning);

                 cpuTask.Start();
             }
         }
         catch (Exception ex)
         {
             // Best effort: chaos injection must never break the host service,
             // but a failure should at least be visible.
             System.Diagnostics.Trace.TraceWarning("Chaos_HighCPU50 injection failed: {0}", ex);
         }
     }

     /// <summary>
     /// Stops all spinning threads and re-arms the event so it can be injected again.
     /// </summary>
     public void Stop()
     {
         // Not disposed: workers may still be reading the old token.
         CancellationTokenSource previous = Interlocked.Exchange(ref cts, new CancellationTokenSource());
         previous.Cancel();
     }
 }

  HighCpu-75%CPU使用率

/// <summary>
/// Chaos event that spins busy-loop threads on about 75% of the logical
/// processors until <see cref="Stop"/> is called.
/// </summary>
class Chaos_HighCPU75 : IChaosEvent
{
    // Replaced on every Stop(): a cancelled CancellationTokenSource cannot be
    // reset, and a Task built on a cancelled token cannot be started, so
    // reusing one source would make every Inject() after the first Stop() a no-op.
    CancellationTokenSource cts;

    public Chaos_HighCPU75()
    {
        cts = new CancellationTokenSource();
    }

    /// <summary>
    /// Starts the spinning threads.
    /// </summary>
    /// <param name="context">Not used by this event; may be null.</param>
    public void Inject(Dictionary<string, string> context)
    {
        // Capture the token once so the workers keep watching the source that
        // was current when they started, even after Stop() installs a new one.
        CancellationToken token = cts.Token;

        // Rounds up, which yields the same number of threads as looping
        // "i < 0.75 * cores" with a fractional bound.
        int workers = (int)Math.Ceiling((75 / 100.0) * Environment.ProcessorCount);

        try
        {
            for (int i = 0; i < workers; i++)
            {
                var cpuTask = new Task(() =>
                {
                    // Intentional busy loop: burning CPU is the whole point.
                    while (!token.IsCancellationRequested)
                    {
                    }
                }, token, TaskCreationOptions.LongRunning);

                cpuTask.Start();
            }
        }
        catch (Exception ex)
        {
            // Best effort: chaos injection must never break the host service,
            // but a failure should at least be visible.
            System.Diagnostics.Trace.TraceWarning("Chaos_HighCPU75 injection failed: {0}", ex);
        }
    }

    /// <summary>
    /// Stops all spinning threads and re-arms the event so it can be injected again.
    /// </summary>
    public void Stop()
    {
        // Not disposed: workers may still be reading the old token.
        CancellationTokenSource previous = Interlocked.Exchange(ref cts, new CancellationTokenSource());
        previous.Cancel();
    }
}

  内存泄露-2G

  /// <summary>
  /// Chaos event that grows the process memory by retaining large strings in a
  /// static list until <see cref="Stop"/> is called.
  /// </summary>
  class Chaos_Memory : IChaosEvent
  {
      // Replaced on every Stop(): a cancelled source cannot be reset, and a Task
      // built on a cancelled token cannot be started.
      CancellationTokenSource cts;

      // Building block for the retained strings: 1024 '1' characters.
      // NOTE(review): this replaces a hand-typed literal of '1' characters whose
      // name indicates 1024 of them; confirm the length if the exact size matters.
      static readonly string OneKB = new string('1', 1024);

      // Guards 'list': the allocation task and Stop() run on different threads
      // and List<T> is not safe for concurrent mutation.
      static readonly object listLock = new object();

      static List<string> list = new List<string>();

      public Chaos_Memory()
      {
          cts = new CancellationTokenSource();
      }

      /// <summary>
      /// Starts a background task that allocates and retains memory. The target
      /// size is read from the "Chaos.MemoryMB" app setting (default "2000");
      /// nothing happens when the setting is not an integer.
      /// </summary>
      /// <param name="context">Not used by this event; may be null.</param>
      public void Inject(Dictionary<string, string> context)
      {
          try
          {
              var count = System.Configuration.ConfigurationManager.AppSettings["Chaos.MemoryMB"];
              if (count == null) count = "2000";

              int c;
              if (!int.TryParse(count, out c))
              {
                  return;
              }

              // Captured once so the task keeps watching the source it was started with.
              CancellationToken token = cts.Token;

              Task task = new Task(
                  () =>
                  {
                      // Each chunk is 1024 * OneKB.Length characters. .NET strings
                      // are UTF-16, so with a 1024-character OneKB a chunk is
                      // about 2 MB; that is why only c / 2 chunks are created.
                      for (int k = 0; k < c / 2; k++)
                      {
                          if (token.IsCancellationRequested)
                          {
                              return;
                          }

                          // Pre-sized so the builder never has to regrow.
                          StringBuilder builder = new StringBuilder(OneKB.Length * 1024);
                          for (int i = 0; i < 1024; i++)
                          {
                              builder.Append(OneKB);
                          }
                          string chunk = builder.ToString();

                          lock (listLock)
                          {
                              // Re-check under the lock so nothing is retained
                              // after Stop() has already emptied the list.
                              if (token.IsCancellationRequested)
                              {
                                  return;
                              }
                              list.Add(chunk);
                          }
                      }
                  }, token, TaskCreationOptions.LongRunning);
              task.Start();
          }
          catch (Exception ex)
          {
              // Best effort: chaos injection must never break the host service,
              // but a failure should at least be visible.
              System.Diagnostics.Trace.TraceWarning("Chaos_Memory injection failed: {0}", ex);
          }
      }

      /// <summary>
      /// Cancels any running allocation task, releases the retained strings and
      /// re-arms the event so it can be injected again.
      /// </summary>
      public void Stop()
      {
          // Cancel first so the worker stops adding before the list is dropped.
          CancellationTokenSource previous = Interlocked.Exchange(ref cts, new CancellationTokenSource());
          previous.Cancel();

          lock (listLock)
          {
              list = new List<string>();
          }
      }
  }

  端口被打爆:

 /// <summary>
 /// Chaos event that opens many TCP connections to one server and keeps them
 /// open until <see cref="Stop"/> is called.
 /// </summary>
 class Chaos_Ports : IChaosEvent
 {
     // Replaced on every Stop() so the event can be injected again.
     CancellationTokenSource cts;

     // Guards 'sockets': the connecting task and Stop() run on different threads.
     readonly object socketsLock = new object();

     // Connections currently held open. Never null: Stop() swaps in an empty
     // list instead of nulling the field, so a later Inject() can keep tracking
     // (and later close) what it opens.
     List<Socket> sockets;

     public Chaos_Ports()
     {
         cts = new CancellationTokenSource();
         sockets = new List<Socket>();
     }

     /// <summary>
     /// Starts a background task that opens the requested number of connections.
     /// </summary>
     /// <param name="context">
     /// Must contain "Count" (number of connections) and "Server" ("host:port").
     /// When it is null or a value is missing or malformed, nothing is opened.
     /// </param>
     public void Inject(Dictionary<string, string> context)
     {
         string countText;
         string server;
         if (context == null
             || !context.TryGetValue("Count", out countText)
             || !context.TryGetValue("Server", out server)
             || server == null)
         {
             System.Diagnostics.Trace.TraceWarning("Chaos_Ports: context must provide \"Count\" and \"Server\"; nothing injected.");
             return;
         }

         int count;
         int port;
         string[] sp = server.Split(':');
         if (!int.TryParse(countText, out count) || sp.Length < 2 || !int.TryParse(sp[1], out port))
         {
             System.Diagnostics.Trace.TraceWarning("Chaos_Ports: invalid Count \"{0}\" or Server \"{1}\"; nothing injected.", countText, server);
             return;
         }

         string host = sp[0];

         // Captured once so the task keeps watching the source it was started with.
         CancellationToken token = cts.Token;

         try
         {
             Task.Factory.StartNew(() =>
             {
                 int failed = 0;
                 Exception lastError = null;

                 for (int i = 0; i < count && !token.IsCancellationRequested; i++)
                 {
                     Socket socket = null;
                     try
                     {
                         socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
                         socket.SetSocketOption(SocketOptionLevel.Socket, SocketOptionName.KeepAlive, true);
                         socket.Connect(host, port);
                         SetKeepAliveValues(socket, true, 36000000, 1000);

                         lock (socketsLock)
                         {
                             if (token.IsCancellationRequested)
                             {
                                 // Stop() already ran; do not keep this connection.
                                 socket.Close();
                                 break;
                             }
                             sockets.Add(socket);
                         }
                     }
                     catch (Exception e)
                     {
                         // A socket that could not be set up is closed right
                         // away; otherwise it is tracked nowhere and Stop()
                         // could never release it.
                         if (socket != null)
                         {
                             socket.Close();
                         }
                         failed++;
                         lastError = e;
                     }
                 }

                 // One summary line instead of one trace per failed connection.
                 if (failed > 0)
                 {
                     System.Diagnostics.Trace.TraceWarning("Chaos_Ports: {0} of {1} connections could not be opened; last error: {2}", failed, count, lastError);
                 }

                 // The list keeps the sockets referenced, so the worker does not
                 // have to stay parked until Stop() is called.
             }, TaskCreationOptions.LongRunning);
         }
         catch (Exception ex)
         {
             System.Diagnostics.Trace.TraceWarning("Chaos_Ports injection failed: {0}", ex);
         }
     }

     /// <summary>
     /// Stops opening new connections, closes the ones held open and re-arms the
     /// event so it can be injected again.
     /// </summary>
     public void Stop()
     {
         CancellationTokenSource previous = Interlocked.Exchange(ref cts, new CancellationTokenSource());
         previous.Cancel();

         List<Socket> toClose;
         lock (socketsLock)
         {
             toClose = sockets;
             sockets = new List<Socket>();
         }

         foreach (Socket socket in toClose)
         {
             try
             {
                 socket.Close();
             }
             catch (Exception)
             {
                 // Best effort: keep closing the remaining sockets.
             }
         }
     }

     /// <summary>
     /// Sets TCP keep-alive on/off, idle time and probe interval on a socket
     /// through <c>IOControlCode.KeepAliveValues</c>.
     /// </summary>
     /// <param name="Socket">Socket to configure.</param>
     /// <param name="On_Off">True to enable keep-alive.</param>
     /// <param name="KeepaLiveTime">Idle time before the first probe; milliseconds per the Windows SIO_KEEPALIVE_VALS contract.</param>
     /// <param name="KeepaLiveInterval">Interval between probes; milliseconds per the same contract.</param>
     /// <returns>The value returned by <c>Socket.IOControl</c>.</returns>
     public int SetKeepAliveValues
       (
            System.Net.Sockets.Socket Socket,
            bool On_Off,
            uint KeepaLiveTime,
            uint KeepaLiveInterval
        )
     {
         // The control code takes three consecutive 32-bit unsigned values in
         // native byte order: on/off at offset 0, time at 4, interval at 8.
         // BitConverter produces the same 12 bytes without unsafe code.
         byte[] inValue = new byte[12];
         BitConverter.GetBytes(Convert.ToUInt32(On_Off)).CopyTo(inValue, 0);
         BitConverter.GetBytes(KeepaLiveTime).CopyTo(inValue, 4);
         BitConverter.GetBytes(KeepaLiveInterval).CopyTo(inValue, 8);

         return Socket.IOControl(IOControlCode.KeepAliveValues, inValue, null);
     }
 }
    // Explicit-layout view over three 32-bit keep-alive settings. Every field
    // has a fixed byte offset, and Bytes overlays all of them starting at
    // offset 0, so the three uint values can be read back as one raw 12-byte
    // buffer.
    [
           System.Runtime.InteropServices.StructLayout
           (
               System.Runtime.InteropServices.LayoutKind.Explicit
           )
    ]
    unsafe struct TcpKeepAlive
    {
        // Raw view of the whole struct: 12 bytes starting at offset 0.
        [System.Runtime.InteropServices.FieldOffset(0)]
        [
              System.Runtime.InteropServices.MarshalAs
               (
                   System.Runtime.InteropServices.UnmanagedType.ByValArray,
                   SizeConst = 12
               )
        ]
        public fixed byte Bytes[12];

        // Bytes 0-3: keep-alive switch stored as a uint.
        [System.Runtime.InteropServices.FieldOffset(0)]
        public uint On_Off;

        // Bytes 4-7: keep-alive time.
        // NOTE(review): presumably milliseconds - confirm against the consumer.
        [System.Runtime.InteropServices.FieldOffset(4)]
        public uint KeepaLiveTime;

        // Bytes 8-11: keep-alive interval.
        // NOTE(review): presumably milliseconds - confirm against the consumer.
        [System.Runtime.InteropServices.FieldOffset(8)]
        public uint KeepaLiveInterval;
    }

  线程被打爆:

/// <summary>
/// Chaos event that occupies threads by starting many tasks that do nothing
/// but block, for up to 20 minutes or until <see cref="Stop"/> is called.
/// </summary>
class Chaos_Threads : IChaosEvent
{
    // Replaced on every Stop(): a cancelled source cannot be reset, and a Task
    // built on a cancelled token cannot be started.
    CancellationTokenSource cts;

    // Guards 'tasks': Inject() and Stop() may run on different threads.
    readonly object tasksLock = new object();

    // Tasks started by the current injection.
    List<Task> tasks = new List<Task>();

    public Chaos_Threads()
    {
        cts = new CancellationTokenSource();
    }

    /// <summary>
    /// Starts the blocking tasks.
    /// </summary>
    /// <param name="context">
    /// Must contain "Threads" (number of tasks to start). When it is null or
    /// the value is missing or not an integer, nothing is started.
    /// </param>
    public void Inject(Dictionary<string, string> context)
    {
        string count;
        int c;
        if (context == null
            || !context.TryGetValue("Threads", out count)
            || !Int32.TryParse(count, out c))
        {
            System.Diagnostics.Trace.TraceWarning("Chaos_Threads: context must provide an integer \"Threads\" value; nothing injected.");
            return;
        }

        // Captured once so the tasks keep watching the source they were started with.
        CancellationToken token = cts.Token;

        try
        {
            for (int i = 0; i < c; i++)
            {
                var task = new Task(() =>
                {
                    // 120 waits of 10 s each: the thread is held for at most 20 minutes.
                    for (int j = 0; j < 120; j++)
                    {
                        // WaitOne returns true as soon as the token is cancelled,
                        // so Stop() releases the thread immediately instead of
                        // after the current 10 s sleep.
                        if (token.WaitHandle.WaitOne(10 * 1000))
                            return;
                    }
                }, token);

                task.Start();

                lock (tasksLock)
                {
                    tasks.Add(task);
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: chaos injection must never break the host service,
            // but a failure should at least be visible.
            System.Diagnostics.Trace.TraceWarning("Chaos_Threads injection failed: {0}", ex);
        }
    }

    /// <summary>
    /// Releases all blocked threads and re-arms the event so it can be injected again.
    /// </summary>
    public void Stop()
    {
        // Not disposed: tasks that have not reached their wait yet still need
        // the old token's wait handle.
        CancellationTokenSource previous = Interlocked.Exchange(ref cts, new CancellationTokenSource());
        previous.Cancel();

        // The tasks end on their own once cancelled. Task.Dispose() throws on a
        // task that has not completed yet, so the list is simply dropped.
        lock (tasksLock)
        {
            tasks = new List<Task>();
        }
    }
}

  服务调用异常:

  /// <summary>
  /// Chaos event that makes a service call fail: every <see cref="Inject"/>
  /// throws until <see cref="Stop"/> has been called.
  /// </summary>
  class Chaos_ServiceException : IChaosEvent
  {
      // Set once by Stop(); nothing in this class ever clears it again.
      bool stopped = false;

      public Chaos_ServiceException()
      {
      }

      /// <summary>
      /// Throws an <see cref="Exception"/> with the message
      /// "Chaos_ServiceException" unless the event has been stopped.
      /// </summary>
      /// <param name="context">Not used by this event; may be null.</param>
      public void Inject(Dictionary<string, string> context)
      {
          if (stopped)
          {
              return;
          }

          throw new Exception("Chaos_ServiceException");
      }

      /// <summary>
      /// Turns all later <see cref="Inject"/> calls into no-ops.
      /// </summary>
      public void Stop()
      {
          stopped = true;
      }
  }

  服务调用超时:

 /// <summary>
 /// Chaos event that delays a service call: every <see cref="Inject"/> blocks
 /// the caller for 10 seconds until <see cref="Stop"/> has been called.
 /// </summary>
 class Chaos_ServiceTimeout : IChaosEvent
 {
     CancellationTokenSource cts;

     // Written by Stop() and read by calling threads, hence volatile.
     volatile bool isStop = false;

     public Chaos_ServiceTimeout()
     {
         cts = new CancellationTokenSource();
     }

     /// <summary>
     /// Blocks the calling thread for 10 seconds, or until <see cref="Stop"/>
     /// is called, whichever comes first. Returns immediately once stopped.
     /// </summary>
     /// <param name="context">Not used by this event; may be null.</param>
     public void Inject(Dictionary<string, string> context)
     {
         if (isStop)
         {
             return;
         }

         // Waiting on the token's handle ends early on cancellation WITHOUT
         // throwing. Task.Delay(..., token).Wait() would surface an
         // AggregateException in every in-flight service call the moment the
         // chaos event is stopped.
         cts.Token.WaitHandle.WaitOne(10 * 1000);
     }

     /// <summary>
     /// Ends all delays in progress and turns later <see cref="Inject"/> calls
     /// into no-ops.
     /// </summary>
     public void Stop()
     {
         // Flag first, so a call arriving during Stop() already skips the wait.
         isStop = true;
         cts.Cancel();
     }
 }

二、设计一个统一的混沌事件注入器,支持各类混沌事件注入,支持混沌事件的热更新和取消

  1. ChaosEventInjecter

     支持混沌事件接口实现的创建、混沌事件注入(全局注入一次,每次调用都注入)、混沌事件取消(停止)

     混沌事件接口实现的创建

/// <summary>
/// Returns the cached chaos event for the given type, creating and caching it
/// on first use.
/// </summary>
/// <param name="chaosEventType">Kind of chaos event wanted.</param>
/// <returns>
/// The event instance, or null when the type has no implementation mapped
/// below. Callers check for null, so an unmapped type must not throw.
/// </returns>
private IChaosEvent GetOrCreateChaosEvent(ChaosEventType chaosEventType)
        {
            IChaosEvent chaosEvent;

            // Fast path without the lock; one lookup instead of ContainsKey + indexer.
            if (eventDic.TryGetValue(chaosEventType, out chaosEvent))
            {
                return chaosEvent;
            }

            lock (syncObj)
            {
                // Another thread may have created the event while we waited for the lock.
                if (eventDic.TryGetValue(chaosEventType, out chaosEvent))
                {
                    return chaosEvent;
                }

                switch (chaosEventType)
                {
                    case ChaosEventType.CPU75:
                        chaosEvent = new Chaos_HighCPU75();
                        break;
                    case ChaosEventType.CPU50:
                        chaosEvent = new Chaos_HighCPU50();
                        break;
                    case ChaosEventType.CPU25:
                        chaosEvent = new Chaos_HighCPU25();
                        break;
                    case ChaosEventType.Memory:
                        chaosEvent = new Chaos_Memory();
                        break;
                    case ChaosEventType.Threads:
                        chaosEvent = new Chaos_Threads();
                        break;
                    case ChaosEventType.ServiceException:
                        chaosEvent = new Chaos_ServiceException();
                        break;
                    case ChaosEventType.ServiceTimeout:
                        chaosEvent = new Chaos_ServiceTimeout();
                        break;
                    case ChaosEventType.Ports:
                        chaosEvent = new Chaos_Ports();
                        break;
                    default:
                        chaosEvent = null;
                        break;
                }

                if (chaosEvent != null)
                {
                    eventDic.TryAdd(chaosEventType, chaosEvent);
                }

                return chaosEvent;
            }
        }

    混沌事件注入(全局注入一次,每次调用都注入)

 // Guards creation of the singleton and of the cached chaos event instances.
 private static object syncObj = new object();
 // Guards the "inject once" bookkeeping in triggeredEvent.
 private static object eventObj = new object();

 private static ChaosEventInjecter instance;
 // One chaos event instance per type, created on first use.
 private ConcurrentDictionary<ChaosEventType, IChaosEvent> eventDic;

 // Types already injected through SingletonInject; key and value are the same type.
 private ConcurrentDictionary<ChaosEventType, ChaosEventType> triggeredEvent;

 private ChaosEventInjecter()
 {
     eventDic = new ConcurrentDictionary<ChaosEventType, IChaosEvent>();
     triggeredEvent = new ConcurrentDictionary<ChaosEventType, ChaosEventType>();
 }
 /// <summary>
 /// Injects a chaos event at most once: a type that is already recorded as
 /// triggered is skipped.
 /// </summary>
 /// <param name="chaosEventType">Kind of chaos event to inject.</param>
 /// <param name="context">Settings handed to the event's Inject method; may be null.</param>
 public void SingletonInject(ChaosEventType chaosEventType, Dictionary<string, string> context = null)
        {
            // Cheap check first so already-triggered types never take the lock.
            if (triggeredEvent.ContainsKey(chaosEventType))
            {
                return;
            }

            lock (eventObj)
            {
                // Check again: another thread may have injected while we waited.
                bool alreadyTriggered = triggeredEvent.ContainsKey(chaosEventType);
                if (alreadyTriggered)
                {
                    return;
                }

                IChaosEvent toInject = GetOrCreateChaosEvent(chaosEventType);
                if (toInject != null)
                {
                    toInject.Inject(context);

                    // Recorded only after Inject returned without throwing.
                    triggeredEvent.TryAdd(chaosEventType, chaosEventType);
                }
            }
        }

  按服务每次调用都注入

/// <summary>
/// Applies the chaos event configured for each of the given services.
/// Exception and timeout events are injected on every call; all other event
/// types go through SingletonInject and are applied at most once.
/// </summary>
/// <param name="serviceId">Identifiers of the services taking part in the current call.</param>
public void ServiceInject(List<string> serviceId)
        {
            var manager = ChaosEventManager.GetIntance();

            // Let the manager stop running events when its configuration becomes empty.
            manager.StopInject = StopInject;
            if (manager.IsEmpty())
            {
                StopInject();
            }

            foreach (var id in serviceId)
            {
                var configured = manager.GetChaosEvent(id);
                if (configured == null)
                {
                    continue;
                }

                var eventType = configured.ChaosEventType;
                bool injectPerCall = eventType == ChaosEventType.ServiceException
                                     || eventType == ChaosEventType.ServiceTimeout;

                if (injectPerCall)
                {
                    Inject(eventType, configured.ChaosValue);
                }
                else
                {
                    SingletonInject(eventType, configured.ChaosValue);
                }
            }
        }

  停止混沌注入

 /// <summary>
 /// Stops every chaos event that was started through SingletonInject and
 /// forgets them, so the same types can be injected again later.
 /// </summary>
 public void StopInject()
        {
            // Same lock as SingletonInject: an event injected while this method
            // runs must not be forgotten without having been stopped.
            lock (eventObj)
            {
                // "||", not "&&": with "&&" a null dictionary is dereferenced
                // and an empty one never takes the early exit.
                if (triggeredEvent == null || triggeredEvent.Count == 0) return;

                foreach (var chaosEventType in triggeredEvent)
                {
                    try
                    {
                        var chaosEvent = GetOrCreateChaosEvent(chaosEventType.Key);

                        // Skip just this entry; returning here would leave the
                        // remaining events running.
                        if (chaosEvent == null) continue;

                        chaosEvent.Stop();
                    }
                    catch (Exception ex)
                    {
                        // One event failing to stop must not keep the others running.
                        System.Diagnostics.Trace.TraceWarning("StopInject: stopping {0} failed: {1}", chaosEventType.Key, ex);
                    }
                }

                triggeredEvent.Clear();
            }
        }

  完整的ChaosEventInjecter代码:

 /// <summary>
    /// Chaos event injector. Creates and caches one chaos event instance per
    /// type, injects events either once or on every call, and stops the events
    /// that were injected once.
    /// </summary>
    public class ChaosEventInjecter
    {
        // Guards creation of the singleton and of the cached chaos event instances.
        private static readonly object syncObj = new object();

        // Serializes SingletonInject and StopInject. When both locks are
        // needed, eventObj is always taken before syncObj.
        private static readonly object eventObj = new object();

        // volatile: read outside the lock in GetIntance.
        private static volatile ChaosEventInjecter instance;

        // One chaos event instance per type, created on first use.
        private readonly ConcurrentDictionary<ChaosEventType, IChaosEvent> eventDic;

        // Types already injected through SingletonInject; key and value are the same type.
        private readonly ConcurrentDictionary<ChaosEventType, ChaosEventType> triggeredEvent;

        private ChaosEventInjecter()
        {
            eventDic = new ConcurrentDictionary<ChaosEventType, IChaosEvent>();
            triggeredEvent = new ConcurrentDictionary<ChaosEventType, ChaosEventType>();
        }

        /// <summary>
        /// Returns the process-wide injector, creating it on first use.
        /// </summary>
        public static ChaosEventInjecter GetIntance()
        {
            if (instance == null)
            {
                lock (syncObj)
                {
                    if (instance == null)
                    {
                        instance = new ChaosEventInjecter();
                    }
                }
            }

            return instance;
        }

        /// <summary>
        /// Injects, at most once, the event named by the "Chaos.Event" app
        /// setting. Does nothing when the setting is absent.
        /// </summary>
        /// <exception cref="ArgumentException">The setting is not a ChaosEventType member name.</exception>
        public void SingletonInject()
        {
            var eventType = System.Configuration.ConfigurationManager.AppSettings["Chaos.Event"];
            if (eventType != null)
            {
                ChaosEventType chaosEvent = (ChaosEventType)Enum.Parse(typeof(ChaosEventType), eventType);

                SingletonInject(chaosEvent);
            }
        }

        /// <summary>
        /// Applies the chaos event configured for each of the given services.
        /// Exception and timeout events are injected on every call; all other
        /// event types are injected at most once.
        /// </summary>
        /// <param name="serviceId">Identifiers of the services taking part in the current call; null is treated as empty.</param>
        public void ServiceInject(List<string> serviceId)
        {
            var manager = ChaosEventManager.GetIntance();

            // Let the manager stop running events when its configuration becomes empty.
            manager.StopInject = StopInject;
            if (manager.IsEmpty())
            {
                StopInject();
            }

            if (serviceId == null)
            {
                return;
            }

            foreach (var service in serviceId)
            {
                var chaosEvent = manager.GetChaosEvent(service);
                if (chaosEvent == null)
                {
                    continue;
                }

                switch (chaosEvent.ChaosEventType)
                {
                    case ChaosEventType.ServiceException:
                    case ChaosEventType.ServiceTimeout:
                        Inject(chaosEvent.ChaosEventType, chaosEvent.ChaosValue);
                        break;
                    default:
                        SingletonInject(chaosEvent.ChaosEventType, chaosEvent.ChaosValue);
                        break;
                }
            }
        }

        /// <summary>
        /// Injects a chaos event at most once: a type that is already recorded
        /// as triggered is skipped until StopInject has run.
        /// </summary>
        /// <param name="chaosEventType">Kind of chaos event to inject.</param>
        /// <param name="context">Settings handed to the event's Inject method; may be null.</param>
        public void SingletonInject(ChaosEventType chaosEventType, Dictionary<string, string> context = null)
        {
            // Cheap check first so already-triggered types never take the lock.
            if (triggeredEvent.ContainsKey(chaosEventType))
            {
                return;
            }

            lock (eventObj)
            {
                if (triggeredEvent.ContainsKey(chaosEventType))
                {
                    return;
                }

                var chaosEvent = GetOrCreateChaosEvent(chaosEventType);
                if (chaosEvent == null) return;

                chaosEvent.Inject(context);
                triggeredEvent.TryAdd(chaosEventType, chaosEventType);
            }
        }

        /// <summary>
        /// Stops every chaos event that was started through SingletonInject and
        /// forgets them, so the same types can be injected again later.
        /// </summary>
        public void StopInject()
        {
            // Same lock as SingletonInject: an event injected while this method
            // runs must not be forgotten without having been stopped.
            lock (eventObj)
            {
                if (triggeredEvent.Count == 0) return;

                foreach (var chaosEventType in triggeredEvent)
                {
                    try
                    {
                        var chaosEvent = GetOrCreateChaosEvent(chaosEventType.Key);

                        // Skip just this entry; returning here would leave the
                        // remaining events running.
                        if (chaosEvent == null) continue;

                        chaosEvent.Stop();
                    }
                    catch (Exception ex)
                    {
                        // One event failing to stop must not keep the others running.
                        System.Diagnostics.Trace.TraceWarning("StopInject: stopping {0} failed: {1}", chaosEventType.Key, ex);
                    }
                }

                triggeredEvent.Clear();
            }
        }

        /// <summary>
        /// Injects a chaos event unconditionally, i.e. on every call.
        /// </summary>
        /// <param name="chaosEventType">Kind of chaos event to inject.</param>
        /// <param name="context">Settings handed to the event's Inject method; may be null.</param>
        public void Inject(ChaosEventType chaosEventType, Dictionary<string, string> context = null)
        {
            var chaosEvent = GetOrCreateChaosEvent(chaosEventType);
            if (chaosEvent == null) return;

            chaosEvent.Inject(context);
        }

        /// <summary>
        /// Returns the cached chaos event for the given type, creating and
        /// caching it on first use.
        /// </summary>
        /// <returns>
        /// The event instance, or null when the type has no implementation
        /// mapped below. Callers check for null, so an unmapped type must not throw.
        /// </returns>
        private IChaosEvent GetOrCreateChaosEvent(ChaosEventType chaosEventType)
        {
            IChaosEvent chaosEvent;

            // Fast path without the lock; one lookup instead of ContainsKey + indexer.
            if (eventDic.TryGetValue(chaosEventType, out chaosEvent))
            {
                return chaosEvent;
            }

            lock (syncObj)
            {
                // Another thread may have created the event while we waited for the lock.
                if (eventDic.TryGetValue(chaosEventType, out chaosEvent))
                {
                    return chaosEvent;
                }

                switch (chaosEventType)
                {
                    case ChaosEventType.CPU75:
                        chaosEvent = new Chaos_HighCPU75();
                        break;
                    case ChaosEventType.CPU50:
                        chaosEvent = new Chaos_HighCPU50();
                        break;
                    case ChaosEventType.CPU25:
                        chaosEvent = new Chaos_HighCPU25();
                        break;
                    case ChaosEventType.Memory:
                        chaosEvent = new Chaos_Memory();
                        break;
                    case ChaosEventType.Threads:
                        chaosEvent = new Chaos_Threads();
                        break;
                    case ChaosEventType.ServiceException:
                        chaosEvent = new Chaos_ServiceException();
                        break;
                    case ChaosEventType.ServiceTimeout:
                        chaosEvent = new Chaos_ServiceTimeout();
                        break;
                    case ChaosEventType.Ports:
                        chaosEvent = new Chaos_Ports();
                        break;
                    default:
                        chaosEvent = null;
                        break;
                }

                if (chaosEvent != null)
                {
                    eventDic.TryAdd(chaosEventType, chaosEvent);
                }

                return chaosEvent;
            }
        }
    }

  2. ChaosEventManager

    混沌事件管理类,负责从Redis中获取每个服务配置的混沌事件,并每隔10秒定时刷新一次:

 /// <summary>
    /// Holds the chaos event configured for each service. The configuration is
    /// read from the "ChaosEvents" hash of the cache service when the singleton
    /// is created and refreshed every 10 seconds by a background task.
    /// </summary>
    class ChaosEventManager
    {
        private static readonly object syncObj = new object();

        // volatile: read outside the lock in GetIntance.
        private static volatile ChaosEventManager instance;

        // Current configuration, keyed by service id. Every refresh REPLACES
        // the whole dictionary (hence volatile) so that entries deleted from
        // the cache disappear here too.
        private volatile ConcurrentDictionary<string, ChaosEvent> eventDic;

        CacheService service = CacheService.GetInstance("DefaultPool");

        /// <summary>
        /// Callback invoked after a refresh that found no configured chaos
        /// events. May be null.
        /// </summary>
        public Action StopInject { get; set; }

        private ChaosEventManager()
        {
            eventDic = new ConcurrentDictionary<string, ChaosEvent>();
            GetAllChaosEvents();

            StartUpdateTask();
        }

        // Starts the background task that reloads the configuration every 10 seconds.
        private void StartUpdateTask()
        {
            var task = new Task(() =>
            {
                while (true)
                {
                    Thread.Sleep(10000);
                    try
                    {
                        GetAllChaosEvents();
                    }
                    catch (Exception ex)
                    {
                        // Keep the current configuration and retry on the next
                        // tick; a failed refresh must not end the loop.
                        System.Diagnostics.Trace.TraceWarning("ChaosEventManager: refreshing chaos events failed: {0}", ex);
                    }
                }
            }, TaskCreationOptions.LongRunning);

            task.Start();
        }

        /// <summary>
        /// Returns the process-wide manager, creating it (and loading the
        /// configuration) on first use.
        /// </summary>
        public static ChaosEventManager GetIntance()
        {
            if (instance == null)
            {
                lock (syncObj)
                {
                    if (instance == null)
                    {
                        instance = new ChaosEventManager();
                    }
                }
            }

            return instance;
        }

        /// <summary>
        /// Returns the chaos event configured for a service, or null when none is configured.
        /// </summary>
        /// <param name="serviceId">Service identifier; must not be null.</param>
        public ChaosEvent GetChaosEvent(string serviceId)
        {
            // Single lookup: ContainsKey followed by the indexer can throw when
            // the entry disappears between the two calls.
            ChaosEvent chaosEvent;
            return eventDic.TryGetValue(serviceId, out chaosEvent) ? chaosEvent : null;
        }

        /// <summary>
        /// True when no chaos event is configured for any service.
        /// </summary>
        public bool IsEmpty()
        {
            var current = eventDic;
            return current == null || current.Count == 0;
        }

        // Reloads the complete configuration and swaps it in. When the reload
        // finds nothing configured, the StopInject callback is invoked.
        private void GetAllChaosEvents()
        {
            var newEventDic = new ConcurrentDictionary<string, ChaosEvent>();
            using (var client = service.GetClient())
            {
                // NOTE(review): GetValueFromHash is presumably generic
                // (<ChaosEvent>) and lost its type argument in the paste -
                // confirm against the cache client API.
                List<string> keys = client.GetHashKeys("ChaosEvents");
                if (keys != null)
                    keys.ForEach(x => newEventDic.TryAdd(x, client.GetValueFromHash("ChaosEvents", x)));
            }

            // Replace rather than merge: merging keeps entries that were deleted
            // from the cache, so cancelled chaos events would be injected forever
            // and IsEmpty() could never become true again.
            eventDic = newEventDic;

            if (newEventDic.Count == 0)
            {
                // Copy first: the property may be reassigned by another thread.
                Action stop = StopInject;
                if (stop != null)
                    stop();
            }
        }
    }

三、在HSF、API网关、中间件SDK层面依赖注入混沌事件注入器

  在HSF服务调用时增加混沌实验事件AOP注入

  API网关、中间件SDK类似的方法进行注入。

 同时我们设计了一个混沌事件注入工具:支持混沌事件实时注入、取消、仿真模拟执行:

  分享:仿真模拟执行的效果,例如CPU25%使用率:

 以上工具和设计思路,分享给大家。

周国庆

2019/3/30