基于 SimHash(相似哈希)算法的文本相似度计算


文本相似度一般用于对告警、日志等文本类数据进行聚类

from simhash import Simhash


def simhash_similarity(text1, text2):
    """Return a SimHash-based similarity score for two texts.

    Each text is fingerprinted with ``Simhash`` and the Hamming distance
    between the two fingerprints is normalised by the fingerprint width.

    :param text1: first text
    :param text2: second text
    :return: ``1 - hamming_distance / fingerprint_bits``; 1.0 means the
        fingerprints are identical, 0.0 means every bit differs
    """
    aa_simhash = Simhash(text1)
    bb_simhash = Simhash(text2)
    # Normalise by the fingerprint width rather than len(bin(value)):
    # bin() counts the "0b" prefix and omits leading zero bits, so its
    # length is not the number of bits the distance is measured over.
    hash_bits = aa_simhash.f
    # Hamming distance between the two fingerprints.
    distance = aa_simhash.distance(bb_simhash)
    return 1 - distance / hash_bits
# Two sample alert lines used to demonstrate the similarity score.
alert01 = (
    "2021-12-19 15:00:00 level=3 host=user-ec2-test app=SSP "
    "awsec2-i-0174e8e87420f0f8c-GreaterThanOrEqualToThreshold-CPUUtilization "
    "CPU使用率 > 90%"
)
alert02 = (
    "2021-12-19 15:01:00 level=2 host=user-ec2-prod app=K8s "
    "awsec2-i-0174e8e87420f0f8c-GreaterThanOrEqualToThreshold-MEMUtilization "
    "内存使用率 > 80%"
)

# Script entry point: print the similarity of the two sample alerts.
if __name__ == "__main__":
    print(simhash_similarity(alert01, alert02))

验证结果: