#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ 销售概览分析服务 """ import pandas as pd import numpy as np from datetime import datetime from collections import defaultdict import json def analyze_sale_overview(df, filename): """ 分析销售概览数据 Args: df: 销售数据DataFrame filename: 文件名 Returns: dict: 分析结果 """ try: print(f"开始分析销售概览数据: {filename}") print(f"数据形状: {df.shape}") # 1. 数据预处理 df = preprocess_data(df) # 2. 计算总体指标 summary = calculate_summary(df) # 3. 按品类分析 categories = analyze_categories(df) # 4. 按SKU分析 sku_data = analyze_skus(df) # 5. 趋势分析 trends = analyze_trends(df) # 6. 异常检测 anomalies = detect_anomalies(df) # 7. 构建结果 results = { 'summary': summary, 'categories': categories['categories_data'], 'category_list': categories['category_list'], 'category_skus': categories['category_skus'], 'data': sku_data['sku_data'], 'sku_list': sku_data['sku_list'], 'trends': trends, 'anomalies': anomalies } print("销售概览分析完成") return results except Exception as e: print(f"分析销售概览时出错: {str(e)}") raise def preprocess_data(df): """ 预处理数据 """ # 复制数据以避免修改原始数据 df = df.copy() # 处理日期列 if '订单创建时间' in df.columns: df['订单创建时间'] = pd.to_datetime(df['订单创建时间'], errors='coerce') # 提取日期部分 df['日期'] = df['订单创建时间'].dt.date # 处理数值列 numeric_columns = ['价格', '购买数量', '买家应付货款', '买家实际支付金额', '退款金额'] for col in numeric_columns: if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce') # 填充缺失值 df = df.fillna({ '价格': 0, '购买数量': 0, '买家应付货款': 0, '买家实际支付金额': 0, '退款金额': 0 }) # 确保SKU列存在 if '商家编码' in df.columns: df['SKU'] = df['商家编码'] elif '外部系统编号' in df.columns: df['SKU'] = df['外部系统编号'] else: # 如果没有SKU列,使用商品名称作为SKU df['SKU'] = df['商品名称'] # 确保品类列存在 if '品类' not in df.columns: # 简单品类划分:基于商品名称 df['品类'] = df['商品名称'].apply(lambda x: categorize_product(x)) return df def categorize_product(product_name): """ 根据商品名称简单分类 """ product_name = str(product_name).lower() if '腰垫' in product_name or '靠垫' in product_name: return '家居用品' elif '手机' in product_name or '电脑' in product_name: return '电子产品' elif '服装' in product_name or '鞋' in product_name: return '服装鞋包' else: return '其他' def calculate_summary(df): """ 计算总体指标 """ total_quantity = df['购买数量'].sum() total_revenue = df['买家实际支付金额'].sum() avg_price = total_revenue / total_quantity if total_quantity > 0 else 0 # 计算促销力度 df['促销力度'] = 1 - (df['买家实际支付金额'] / df['买家应付货款']) df['促销力度'] = df['促销力度'].fillna(0) avg_promotion = df['促销力度'].mean() * 100 # 转换为百分比 # 计算退款相关指标 refunded_orders = df[df['退款状态'] != '没有申请退款'] total_refund = refunded_orders['退款金额'].sum() refund_rate = len(refunded_orders) / len(df) * 100 if len(df) > 0 else 0 summary = { 'total_orders': len(df), 'total_quantity': int(total_quantity), 'total_revenue': round(total_revenue, 2), 'avg_price': round(avg_price, 2), 'avg_promotion': round(avg_promotion, 2), 'total_refund': round(total_refund, 2), 'refund_rate': round(refund_rate, 2) } return summary def analyze_categories(df): """ 按品类分析 """ categories_data = {} category_list = [] category_skus = defaultdict(list) # 按品类分组 grouped = df.groupby('品类') for category, group in grouped: category_list.append(category) # 计算品类指标 total_quantity = group['购买数量'].sum() total_revenue = group['买家实际支付金额'].sum() avg_price = total_revenue / total_quantity if total_quantity > 0 else 0 # 计算促销力度 group['促销力度'] = 1 - (group['买家实际支付金额'] / group['买家应付货款']) group['促销力度'] = group['促销力度'].fillna(0) avg_promotion = group['促销力度'].mean() * 100 # 计算退款相关指标 refunded_orders = group[group['退款状态'] != '没有申请退款'] total_refund = refunded_orders['退款金额'].sum() refund_rate = len(refunded_orders) / len(group) * 100 if len(group) > 0 else 0 # 趋势数据 date_series = [] quantity_series = [] price_series = [] # 按日期排序 date_grouped = group.groupby('日期') for date in sorted(date_grouped.groups.keys()): date_series.append(str(date)) date_data = date_grouped.get_group(date) quantity_series.append(int(date_data['购买数量'].sum())) avg_date_price = date_data['买家实际支付金额'].sum() / date_data['购买数量'].sum() if date_data['购买数量'].sum() > 0 else 0 price_series.append(round(avg_date_price, 2)) categories_data[category] = { 'total_quantity': int(total_quantity), 'total_revenue': round(total_revenue, 2), 'avg_price': round(avg_price, 2), 'avg_promotion': round(avg_promotion, 2), 'total_refund': round(total_refund, 2), 'refund_rate': round(refund_rate, 2), 'date_series': date_series, 'quantity_series': quantity_series, 'price_series': price_series } # 收集该品类下的SKU skus = group['SKU'].unique().tolist() category_skus[category] = skus return { 'categories_data': categories_data, 'category_list': category_list, 'category_skus': dict(category_skus) } def analyze_skus(df): """ 按SKU分析 """ sku_data = {} sku_list = [] # 按SKU分组 grouped = df.groupby('SKU') for sku, group in grouped: sku_list.append(sku) # 计算SKU指标 total_quantity = group['购买数量'].sum() total_revenue = group['买家实际支付金额'].sum() avg_price = total_revenue / total_quantity if total_quantity > 0 else 0 # 计算促销力度 group['促销力度'] = 1 - (group['买家实际支付金额'] / group['买家应付货款']) group['促销力度'] = group['促销力度'].fillna(0) avg_promotion = group['促销力度'].mean() * 100 # 计算退款相关指标 refunded_orders = group[group['退款状态'] != '没有申请退款'] total_refund = refunded_orders['退款金额'].sum() refund_rate = len(refunded_orders) / len(group) * 100 if len(group) > 0 else 0 # 趋势数据 date_series = [] quantity_series = [] price_series = [] # 按日期排序 date_grouped = group.groupby('日期') for date in sorted(date_grouped.groups.keys()): date_series.append(str(date)) date_data = date_grouped.get_group(date) quantity_series.append(int(date_data['购买数量'].sum())) avg_date_price = date_data['买家实际支付金额'].sum() / date_data['购买数量'].sum() if date_data['购买数量'].sum() > 0 else 0 price_series.append(round(avg_date_price, 2)) sku_data[sku] = { 'total_quantity': int(total_quantity), 'total_revenue': round(total_revenue, 2), 'avg_price': round(avg_price, 2), 'avg_promotion': round(avg_promotion, 2), 'total_refund': round(total_refund, 2), 'refund_rate': round(refund_rate, 2), 'date_series': date_series, 'quantity_series': quantity_series, 'price_series': price_series } return { 'sku_data': sku_data, 'sku_list': sku_list } def analyze_trends(df): """ 分析趋势数据 """ # 按日期分组 date_grouped = df.groupby('日期') date_series = [] quantity_series = [] revenue_series = [] avg_price_series = [] avg_promotion_series = [] for date in sorted(date_grouped.groups.keys()): date_series.append(str(date)) date_data = date_grouped.get_group(date) quantity = date_data['购买数量'].sum() revenue = date_data['买家实际支付金额'].sum() avg_price = revenue / quantity if quantity > 0 else 0 # 计算促销力度 date_data['促销力度'] = 1 - (date_data['买家实际支付金额'] / date_data['买家应付货款']) date_data['促销力度'] = date_data['促销力度'].fillna(0) avg_promotion = date_data['促销力度'].mean() * 100 quantity_series.append(int(quantity)) revenue_series.append(round(revenue, 2)) avg_price_series.append(round(avg_price, 2)) avg_promotion_series.append(round(avg_promotion, 2)) trends = { 'date_series': date_series, 'quantity_series': quantity_series, 'revenue_series': revenue_series, 'avg_price_series': avg_price_series, 'avg_promotion_series': avg_promotion_series } return trends def detect_anomalies(df): """ 检测异常数据 """ anomalies = [] # 按SKU分组检测异常 grouped = df.groupby('SKU') for sku, group in grouped: # 检测销量异常 quantity_mean = group['购买数量'].mean() quantity_std = group['购买数量'].std() if quantity_std > 0: for idx, row in group.iterrows(): quantity = row['购买数量'] z_score = abs((quantity - quantity_mean) / quantity_std) if z_score > 2.5: anomalies.append({ 'date': str(row['日期']) if pd.notna(row['日期']) else '未知', 'sku': sku, 'type': 'quantity_spike' if quantity > quantity_mean else 'quantity_drop', 'reason': f'销量异常,偏离均值 {z_score:.2f} 个标准差', 'value': float(quantity), 'expected': float(quantity_mean), 'deviation': float(z_score) }) # 检测价格异常 price_mean = group['价格'].mean() price_std = group['价格'].std() if price_std > 0: for idx, row in group.iterrows(): price = row['价格'] z_score = abs((price - price_mean) / price_std) if z_score > 2.5: anomalies.append({ 'date': str(row['日期']) if pd.notna(row['日期']) else '未知', 'sku': sku, 'type': 'price_spike' if price > price_mean else 'price_drop', 'reason': f'价格异常,偏离均值 {z_score:.2f} 个标准差', 'value': float(price), 'expected': float(price_mean), 'deviation': float(z_score) }) anomaly_count = len(anomalies) anomaly_rate = (anomaly_count / len(df)) * 100 if len(df) > 0 else 0 return { 'anomaly_count': anomaly_count, 'anomaly_rate': round(anomaly_rate, 2), 'anomalies': anomalies }