AI运维实践

1. 智能告警收敛

痛点:告警风暴

一个故障触发几十上百条告警,运维人员无法快速定位根因。

方案:LLM + 告警聚类


import re

import openai

def consolidate_alerts(alerts, llm_client):
    """Ask the LLM whether a batch of simultaneous alerts share one root cause.

    Args:
        alerts: iterable of dicts with ``name``, ``host`` and ``time`` keys.
        llm_client: OpenAI-compatible client exposing ``chat.completions.create``.

    Returns:
        The model's free-text analysis (same-source verdict, likely root
        cause, suggested remediation).
    """
    # One bullet line per alert, newline-joined into a single summary.
    summary = "\n".join(
        f"- {alert['name']} @ {alert['host']} ({alert['time']})"
        for alert in alerts
    )

    prompt = f'''以下是一组同时发生的告警,请分析是否为同一故障:
{summary}

请输出:
1. 是否同源:[是/否]
2. 可能根因:[描述]
3. 建议处理:[步骤]'''

    reply = llm_client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": prompt}],
    )
    return reply.choices[0].message.content

# Example: all dict keys must be lowercase "name" — consolidate_alerts reads
# a['name'], so the previous mixed-case "Name" keys raised KeyError.
# `client` is assumed to be an OpenAI-compatible client created elsewhere.
alerts = [
    {"name": "CPU > 90%", "host": "server-1", "time": "10:01"},
    {"name": "CPU > 90%", "host": "server-2", "time": "10:01"},
    {"name": "网络丢包", "host": "server-1", "time": "10:01"},
    {"name": "MySQL 连接数满", "host": "db-1", "time": "10:02"},
]
result = consolidate_alerts(alerts, client)

2. 根因分析(Root Cause Analysis)


SYSTEM_PROMPT = '''你是一个资深 SRE。收到告警后请:
1. 收集相关指标(CPU/内存/网络/应用日志)
2. 分析时间线,找出最早异常
3. 给出根因假设(Top 3)
4. 提供修复步骤

输出格式:
- 根因假设:[按可能性排序]
- 修复建议:[具体命令/操作]'''

def root_cause_analysis(incident, llm_client, context="", logs="", events=""):
    """Run an LLM-driven root-cause analysis for one incident.

    Bug fix: the original body referenced ``context``, ``logs`` and ``events``
    that were never defined (NameError). They are now explicit keyword
    parameters so callers can supply the evidence; defaults keep the old
    two-argument call signature working.

    Args:
        incident: dict with at least a ``description`` key.
        llm_client: OpenAI-compatible client exposing ``chat.completions.create``.
        context: related metrics (CPU/memory/network) as text.
        logs: related application log excerpts as text.
        events: related Kubernetes events as text.

    Returns:
        The raw chat-completion response object (callers read
        ``.choices[0].message.content``).
    """
    prompt = f'''故障信息:{incident['description']}
相关指标:{context}
相关日志:{logs}
K8s 事件:{events}

请分析根因并给出修复建议。'''

    return llm_client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ]
    )

3. 自然语言数据库查询(Text-to-SQL)


def text_to_sql(question, schema_context, llm_client):
    """Translate a natural-language question into a read-only SQL query.

    Improvements over the original:
    - strips markdown code fences (``` / ```sql) that chat models commonly
      wrap SQL in, so downstream execution gets bare SQL;
    - the forbidden-keyword guard now matches whole words, so identifiers
      like ``updated_at`` no longer trigger a false "UPDATE" rejection.

    Args:
        question: natural-language question about the data.
        schema_context: textual description of the table schema.
        llm_client: OpenAI-compatible client exposing ``chat.completions.create``.

    Returns:
        The generated SQL text (SELECT-only).

    Raises:
        ValueError: if the model emitted a mutating statement.
    """
    response = llm_client.chat.completions.create(
        model="deepseek-chat",
        messages=[{
            "role": "user",
            "content": f'''根据数据库表结构,将自然语言问题转为 SQL 查询。
只允许 SELECT 语句,不允许 UPDATE/DELETE/DROP。

表结构:
{schema_context}

问题:{question}

要求:
1. 只输出 SQL 语句
2. 加注释说明查询目的'''
        }]
    )
    sql = response.choices[0].message.content.strip()
    # Models often wrap output in ```sql fences despite instructions.
    sql = re.sub(r'^```(?:sql)?\s*|```$', '', sql, flags=re.IGNORECASE).strip()
    # Safety check: reject mutating statements. Word-boundary match avoids
    # false positives on identifiers that merely contain a keyword.
    forbidden = ['UPDATE', 'DELETE', 'DROP', 'TRUNCATE', 'INSERT']
    pattern = r'\b(' + '|'.join(forbidden) + r')\b'
    if re.search(pattern, sql, flags=re.IGNORECASE):
        raise ValueError("禁止的非查询语句")
    return sql

4. 运维知识库问答(RAG)


# Build the ops knowledge base (K8s / database / middleware docs).
# NOTE(review): this runs at import time with external side effects
# (filesystem scan + Milvus connection) — confirm that is intended.
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Recursively load every Markdown file under the docs root.
loader = DirectoryLoader('/opt/ops-docs', glob="**/*.md")
docs = loader.load()

# Embedding model: BGE large, Chinese-oriented variant.
embeddings = HuggingFaceBgeEmbeddings(model_name='BAAI/bge-large-zh-v1.5')

# Persist embeddings into a Milvus vector store
# (assumes a Milvus instance on localhost's default port 19530).
from langchain.vectorstores import Milvus
vectorstore = Milvus.from_documents(
    docs, embeddings,
    connection_args={"host": "localhost", "port": 19530}
)

# Retrieval + generation: fetch the top-5 most similar chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

def ask_ops(question, llm_client):
    """Answer an ops question grounded in retrieved knowledge-base documents.

    Args:
        question: natural-language ops question.
        llm_client: OpenAI-compatible client exposing ``chat.completions.create``.

    Returns:
        The model's answer text, generated from the module-level ``retriever``'s
        top matching document chunks.
    """
    # Pull the most relevant chunks and stitch them into one context blob.
    hits = retriever.get_relevant_documents(question)
    knowledge = "\n\n".join(doc.page_content for doc in hits)

    prompt = f'''基于以下运维文档回答问题:
{knowledge}

问题:{question}'''

    answer = llm_client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": prompt}],
    )
    return answer.choices[0].message.content

5. 下一步