| """ 数据生成脚本 - 星点视频数据集 用途:为竞赛训练生成原始脏数据(raw_videos.csv)及清洗后的干净数据(cleaned_videos.csv) 运行方式:python generate_data.py 数据量:默认200,000条,可调整 TOTAL_RECORDS """
import pandas as pd import numpy as np import random from datetime import datetime, timedelta
# ========== 配置参数 ========== TOTAL_RECORDS = 200000 SEED = 2025 OUTPUT_RAW = "raw_videos.csv" OUTPUT_CLEAN = "cleaned_videos.csv"
# 分类列表 CATEGORIES = [ "tv", "movie", "variety", "anime", "doc", "game", "music", "knowledge", "tech", "life", "food", "fashion", "sports", "auto", "animal" ]
# 标签池 TAG_POOL = ["热门", "推荐", "精选", "新作", "搞笑", "剧情", "治愈", "燃向", "经典", "连载"]
# ========== 随机状态初始化 ========== random.seed(SEED) np.random.seed(SEED)
# ========== Helper functions ==========
def generate_video_id(idx):
    """Build the unique video id "VID-XXXXXX" for row number ``idx``."""
    return "VID-{}".format(100000 + idx)
def random_duration(good=True):
    """Return a "MM:SS" duration string.

    When ``good`` is False, return a deliberately broken value instead:
    missing (NaN/empty string), a "bad format" with seconds possibly > 59,
    or outright garbage. RNG call order matches the seeded global stream.
    """
    mins = random.randint(0, 180)
    secs = random.randint(0, 59)
    if good:
        return f"{mins:02d}:{secs:02d}"
    # Corrupt the value in one of three ways.
    corruption = random.choice(["missing", "bad_format", "bogus"])
    if corruption == "missing":
        coin = random.random()
        return np.nan if coin < 0.5 else ""
    if corruption == "bad_format":
        # Unpadded fields; the seconds part may exceed 59.
        return "{}:{}".format(random.randint(0, 99), random.randint(0, 99))
    coin = random.random()
    return "abcd" if coin < 0.5 else "12:345"
def random_tags(allow_dup=True):
    """Return 1-4 comma-joined tags drawn from TAG_POOL.

    When ``allow_dup`` is True, ~10% of multi-tag rows get a manufactured
    duplicate (second tag overwritten with the first).
    """
    count = random.randint(1, 4)
    picked = random.choices(TAG_POOL, k=count)
    if allow_dup and count > 1:
        # Same short-circuit as the original: only roll when eligible.
        if random.random() < 0.1:
            picked[1] = picked[0]
    return ",".join(picked)
def random_int_with_anomaly(mean, std, anomaly_prob=0.02, missing_prob=0.02):
    """Sample a positive integer field that may be missing or anomalous.

    Returns NaN with probability ``missing_prob``; an anomaly (zero or a
    negative value) with probability ``anomaly_prob``; otherwise a value
    from a log-normal distribution centred near ``mean``, clamped to >= 1.
    """
    roll_missing = random.random()
    if roll_missing < missing_prob:
        return np.nan
    roll_anomaly = random.random()
    if roll_anomaly < anomaly_prob:
        # Anomaly: zero or a negative count.
        negative = -random.randint(1, 1000)
        return random.choice([0, negative])
    sample = np.random.lognormal(mean=np.log(max(mean, 1)), sigma=std)
    return int(max(1, sample))
# ========== 生成原始数据 ========== print(f"开始生成 {TOTAL_RECORDS} 条原始数据...") data = [] for i in range(TOTAL_RECORDS): if i % 50000 == 0: print(f" 已生成 {i} 条...") vid = generate_video_id(i) cat = random.choice(CATEGORIES) # 播放量 (均值约 50万,标准差造成变化) plays = random_int_with_anomaly(500000, 1.2, anomaly_prob=0.015, missing_prob=0.02) # 评论数 (均值约 5000) comments = random_int_with_anomaly(5000, 1.5, anomaly_prob=0.015, missing_prob=0.02) # 点赞数 (均值约 20万) likes = random_int_with_anomaly(200000, 1.3, anomaly_prob=0.015, missing_prob=0.02) # 收藏数 (均值约 8000) favorites = random_int_with_anomaly(8000, 1.4, anomaly_prob=0.015, missing_prob=0.02) # 时长 (90%概率生成正常格式,10%可能有问题) duration = random_duration(good=(random.random() > 0.1)) # 标签 (很少部分缺失,部分有重复) if random.random() < 0.02: tags = np.nan else: tags = random_tags(allow_dup=True) data.append([vid, cat, plays, comments, likes, favorites, duration, tags])
# Inject ~0.5% exact duplicate rows so drop_duplicates has work downstream.
num_dup = int(TOTAL_RECORDS * 0.005)
for _ in range(num_dup):
    source_row = random.randint(0, TOTAL_RECORDS - 1)
    data.append(list(data[source_row]))  # append an identical copy
# Inject 50 extreme play-count outliers (tens of millions) at random rows.
for _ in range(50):
    target = random.randint(0, len(data) - 1)
    data[target][2] = random.randint(10000000, 50000000)
# Assemble the raw DataFrame and persist it before cleaning begins.
raw_columns = ["video_id", "category", "plays", "comments", "likes",
               "favorites", "duration", "tags"]
df_raw = pd.DataFrame(data, columns=raw_columns)
print(f"原始数据生成完毕,总计 {len(df_raw)} 行。")

df_raw.to_csv(OUTPUT_RAW, index=False)
print(f"已保存原始数据至 {OUTPUT_RAW}")

# ========== Data cleaning (per the task requirements) ==========
# Work on a copy so df_raw stays untouched.
print("\n开始清洗数据...")
df = df_raw.copy()
# 1. Convert the "MM:SS" duration string to an integer number of seconds.
def duration_to_seconds(val):
    """Parse a "MM:SS" duration string into total seconds.

    Returns NaN for anything unparseable: missing values, non-strings,
    wrong field counts, non-numeric parts, negative fields, or a seconds
    field outside 0-59.
    """
    if pd.isna(val) or not isinstance(val, str):
        return np.nan
    parts = val.strip().split(":")
    if len(parts) != 2:
        return np.nan
    try:
        m, s = int(parts[0]), int(parts[1])
    except ValueError:  # was a bare `except:`; only int() parsing can fail here
        return np.nan
    # Reject impossible clock values (negative fields or seconds >= 60).
    if m < 0 or s < 0 or s >= 60:
        return np.nan
    return m * 60 + s
df["duration_seconds"] = df["duration"].apply(duration_to_seconds)
# 2. 删除任一字段缺失的行(包括 duration_seconds 为NaN) cols_to_check = ["video_id", "category", "plays", "comments", "likes", "favorites", "tags", "duration_seconds"] df = df.dropna(subset=cols_to_check)
# 3. 将 plays, comments, likes, favorites 转换为数值类型,无法转换的变为 NaN 然后删除 for col in ["plays", "comments", "likes", "favorites"]: df[col] = pd.to_numeric(df[col], errors='coerce') df = df.dropna(subset=["plays", "comments", "likes", "favorites"])
# 4. 转换为整数 for col in ["plays", "comments", "likes", "favorites"]: df[col] = df[col].astype(int)
# 5. 删除负数或零值 for col in ["plays", "comments", "likes", "favorites"]: df = df[df[col] > 0]
# 6. Collapse duplicate tags within a row.
def dedup_tags(tag_str):
    """Remove duplicate comma-separated tags, preserving first-seen order.

    Non-string inputs (e.g. NaN) pass through unchanged; empty fragments
    produced by stray commas or whitespace are dropped.
    """
    if not isinstance(tag_str, str):
        return tag_str
    fragments = [part.strip() for part in tag_str.split(",") if part.strip()]
    # dict.fromkeys keeps insertion order, giving an ordered de-dup.
    return ",".join(dict.fromkeys(fragments))
df["tags"] = df["tags"].apply(dedup_tags)
# 7. 删除完全重复的行(基于所有列) df = df.drop_duplicates()
# 8. 删除 plays 大于 99.9分位数的极端离群值 threshold = df["plays"].quantile(0.999) df = df[df["plays"] <= threshold]
# 9. 按 plays 降序排列,重置索引 df = df.sort_values("plays", ascending=False).reset_index(drop=True)
print(f"清洗完成,剩余 {len(df)} 行。")
# 保存清洗后数据 df.to_csv(OUTPUT_CLEAN, index=False) print(f"已保存清洗后数据至 {OUTPUT_CLEAN}")
""" 机器学习数据集生成脚本 从清洗后的数据中划分训练集和预测集,并分别生成回归/分类专用文件 """ from sklearn.model_selection import train_test_split
# Load the cleaned dataset produced by the cleaning stage above.
df = pd.read_csv("cleaned_videos.csv")

# Fail fast if any expected column is absent (raises on the first one found).
required = ["video_id", "category", "plays", "comments", "likes",
            "favorites", "duration_seconds", "tags"]
for column in required:
    if column not in df.columns:
        raise ValueError(f"缺少列: {column}")
# Split 80/20 with stratified sampling on category so both sets keep the
# category distribution.
# NOTE(fix): the original passed df.index to train_test_split and then
# selected rows with df.iloc[...], which treats index *labels* as positions.
# That only works because read_csv yields a default RangeIndex; with any
# other index it silently picks the wrong rows. Splitting the frame itself
# is label-safe and, for the same random_state, selects the same rows.
train_df, predict_df = train_test_split(
    df,
    test_size=0.2,
    random_state=2025,
    stratify=df["category"],
)
train_df = train_df.copy()
predict_df = predict_df.copy()
# Shared feature columns for both tasks (everything except the two targets).
feature_cols = ["video_id", "comments", "likes", "favorites",
                "duration_seconds", "tags"]

# ---------- Regression task (target: plays) ----------
# Training file keeps features plus the target.
reg_train = train_df[feature_cols + ["plays"]]
reg_train.to_csv("ml_reg_train.csv", index=False)

# Prediction file omits the target.
reg_predict = predict_df[feature_cols]
reg_predict.to_csv("ml_reg_predict.csv", index=False)

# ---------- Classification task (target: category) ----------
clf_train = train_df[feature_cols + ["category"]]
clf_train.to_csv("ml_clf_train.csv", index=False)

clf_predict = predict_df[feature_cols]
clf_predict.to_csv("ml_clf_predict.csv", index=False)

print("生成完成。")
print(f"回归训练集: {len(reg_train)} 条")
print(f"回归预测集: {len(reg_predict)} 条 (不含 plays)")
print(f"分类训练集: {len(clf_train)} 条")
print(f"分类预测集: {len(clf_predict)} 条 (不含 category)")
print("完成!")