基于 SimHash(相似性哈希)算法的文本相似度计算
文本相似度一般用于处理告警或日志聚类等文本类数据
from simhash import Simhash


def simhash_similarity(text1: str, text2: str) -> float:
    """Return the SimHash-based similarity of two texts.

    Each text is fingerprinted with ``Simhash`` and the Hamming distance
    between the two fingerprints is normalised by the fingerprint width.

    :param text1: first text
    :param text2: second text
    :return: ``1 - hamming_distance / fingerprint_bits``; 1.0 when the two
        fingerprints are identical, decreasing as more bits differ
    """
    first = Simhash(text1)
    second = Simhash(text2)

    # Normalise by the fixed fingerprint width (``f``) rather than by
    # ``len(bin(value))``: that string length counts the "0b" prefix and
    # omits leading zero bits, so it varies with the hash value and
    # overstates the number of bits.
    hash_bits = first.f

    # Hamming distance: number of bit positions where the fingerprints differ.
    distance = first.distance(second)
    return 1 - distance / hash_bits


# Two sample alert lines used by the demo run below.
alert01 = "2021-12-19 15:00:00 level=3 host=user-ec2-test app=SSP awsec2-i-0174e8e87420f0f8c-GreaterThanOrEqualToThreshold-CPUUtilization CPU使用率 > 90%"
alert02 = "2021-12-19 15:01:00 level=2 host=user-ec2-prod app=K8s awsec2-i-0174e8e87420f0f8c-GreaterThanOrEqualToThreshold-MEMUtilization 内存使用率 > 80%"

if __name__ == '__main__':
    print(simhash_similarity(alert01, alert02))
验证结果: