【模型】Qwen2-VL 服务端UI

news2025/12/17 18:45:48

1. 前言

最近在测试VLM模型，发现官方的网页demo，代码中视频与图片分辨率可能由于高并发设置的很小，导致达不到预期效果，于是自己研究了一下，搞了一个简单的前端部署，自己在服务器部署了下UI界面，方便在本地笔记本进行测试。

2.代码

import streamlit as st
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import os

# 加载模型和处理器 (只加载一次)
@st.cache_resource  # 这个装饰器会缓存模型和处理器
def load_model():
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "../qwen2_vl/model_7b/", torch_dtype=torch.float16, device_map="auto"
    )
    processor = AutoProcessor.from_pretrained("../qwen2_vl/model_7b/")
    return model, processor


# 加载模型和处理器
model, processor = load_model()


def load_image(image_file):
    img = Image.open(image_file)
    return img


# Function to load and resize image to fixed height
def resize_image_to_height(image, height):
    # Resize image keeping the aspect ratio
    width = int(image.width * height / image.height)
    return image.resize((width, height))


# 处理输入
def process_input(messages):
    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Inference
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    # Clear all intermediate variables and free GPU memory
    del text, image_inputs, video_inputs, inputs, generated_ids, generated_ids_trimmed
    torch.cuda.empty_cache()
    return output_text


# Streamlit UI
st.title("VLM 视觉内容理解")

# 选择文件上传
uploaded_file = st.file_uploader("上传图片或视频", type=["jpg", "jpeg", "png", "mp4"])

# 判断文件是否上传
if uploaded_file is not None:
    # 保存文件到本地
    upload_dir = "uploads"  # 上传文件保存目录
    if not os.path.exists(upload_dir):
        os.makedirs(upload_dir)

    file_path = os.path.join(upload_dir, uploaded_file.name)

    # 保存文件
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # 判断文件类型
    if uploaded_file.type.startswith("image"):
        # 加载并显示图片
        img = load_image(file_path)
        # 设置固定高度
        fixed_height = 300  # 设置固定高度为300px
        # 调整图片的大小，使高度固定，宽度按比例调整
        img_resized = resize_image_to_height(img, fixed_height)
        st.image(img_resized, use_container_width=False)

        # 输入台词部分
        st.subheader("输入一句提示词")
        user_input = st.text_input("请输入提示词，并回车确认：")

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": file_path,  # 使用本地保存的文件路径
                        "max_pixels": 1024 * 960
                    },
                    {"type": "text", "text": user_input},
                ],
            }
        ]

    elif uploaded_file.type.startswith("video"):
        # 设置视频固定高度
        fixed_height = 300  # 设置固定高度为300px

        # 显示视频
        st.video(file_path)

        st.subheader("输入一句提示词")
        user_input = st.text_input("请输入提示词，并回车确认：")

        # 通过 Markdown 来调整视频的显示样式
        st.markdown(
            f"""
            <style>
                video {{
                    height: {fixed_height}px;
                    width: auto;
                }}
            </style>
            """,
            unsafe_allow_html=True
        )

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": file_path,  # 使用本地保存的文件路径
                        "max_pixels": 960 * 480,
                        "fps": 1.0,
                    },
                    {"type": "text", "text": user_input},
                ],
            }
        ]

    # 调用模型进行推理
    result = process_input(messages)

    if result:
        # 使用 st.markdown 和 CSS 来自动换行
        st.markdown("### 模型推理结果：")
        # 将输出格式化为代码块样式，并通过 CSS 实现自动换行
        st.markdown(
            f'<pre style="white-space: pre-wrap; word-wrap: break-word;">{result[0]}</pre>',
            unsafe_allow_html=True
        )
    else:
        st.markdown("### 模型推理结果：无结果。")

    # 推理完成后删除本地文件
    try:
        os.remove(file_path)
    except Exception as e:
        pass