spu_hotproduct_analyzer_service.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. """
  2. SPU爆款系数分析服务
  3. 根据《2025-11-23:SPU 爆款系数算法逻辑总结(V2.0)》实现
  4. """
  5. import pandas as pd
  6. import numpy as np
  7. from sklearn.preprocessing import MinMaxScaler
  8. from datetime import datetime
  9. def analyze_spu_hotproduct(df, filename):
  10. """
  11. 分析SPU爆款系数
  12. Args:
  13. df: 清洗后的DataFrame,需包含test2.csv相关字段
  14. filename: 文件名(用于日志)
  15. Returns:
  16. dict: 爆款分析结果
  17. """
  18. print(f"🔥 开始爆款系数分析: {filename}")
  19. # 1. 数据准备与字段映射
  20. df_work = df.copy()
  21. # 字段映射(适配系统现有字段)
  22. required_fields = {
  23. '订单付款时间': '下单时间',
  24. 'SPU': '商品ID',
  25. '购买数量': '数量',
  26. '价格': '价格',
  27. '买家实际支付金额': '商家实收',
  28. '订单状态': '订单状态'
  29. }
  30. # 重命名字段以匹配算法文档
  31. for sys_field, doc_field in required_fields.items():
  32. if sys_field in df_work.columns:
  33. df_work[doc_field] = df_work[sys_field]
  34. # 添加小时字段
  35. df_work['小时'] = pd.to_datetime(df_work['下单时间']).dt.hour
  36. # 添加用户ID(如果没有则用订单号模拟)
  37. if '买家会员名' in df_work.columns:
  38. df_work['用户ID'] = df_work['买家会员名']
  39. elif '订单编号' in df_work.columns:
  40. df_work['用户ID'] = df_work['订单编号']
  41. else:
  42. # 生成模拟用户ID
  43. df_work['用户ID'] = range(len(df_work))
  44. # 2. 筛选交易成功订单
  45. valid = df_work[df_work['订单状态'] == '交易成功'].copy()
  46. print(f"✅ 有效订单数: {len(valid)}")
  47. if len(valid) == 0:
  48. return {
  49. 'success': False,
  50. 'message': '没有交易成功的订单',
  51. 'data': {}
  52. }
  53. # 3. 聚合到SPU级别
  54. spu_agg = valid.groupby('商品ID').agg(
  55. 总销量=('数量', 'sum'),
  56. 总实收=('商家实收', 'sum'),
  57. 标价=('价格', 'mean'),
  58. 首单=('下单时间', 'min'),
  59. 末单=('下单时间', 'max'),
  60. 总UID=('用户ID', 'nunique')
  61. ).reset_index()
  62. # 计算天数跨度
  63. spu_agg['天数跨度'] = (spu_agg['末单'] - spu_agg['首单']).dt.days + 1
  64. # 避免除零
  65. spu_agg['天数跨度'] = spu_agg['天数跨度'].apply(lambda x: max(x, 1))
  66. # 计算单位时间销量
  67. spu_agg['单位时间销量'] = spu_agg['总销量'] / spu_agg['天数跨度']
  68. # 计算实收率(价格接受度)
  69. spu_agg['实收率'] = spu_agg['总实收'] / (spu_agg['标价'] * spu_agg['总销量'])
  70. # 限制实收率范围在0-1之间(可能存在折扣或优惠券导致>1的情况)
  71. spu_agg['实收率'] = spu_agg['实收率'].clip(0, 1)
  72. # 4. 计算退款率
  73. if '退款状态' in df_work.columns:
  74. refund_count = df_work[df_work['退款状态'] == '退款成功'].groupby('商品ID').size()
  75. total_count = df_work.groupby('商品ID').size()
  76. refund_rate = refund_count / total_count
  77. else:
  78. # 如果没有退款状态字段,假设退款率为0
  79. refund_rate = pd.Series(0, index=spu_agg['商品ID'])
  80. spu_agg['退款率'] = spu_agg['商品ID'].map(refund_rate).fillna(0)
  81. spu_agg['稳定性得分'] = 1 - spu_agg['退款率']
  82. # 5. 计算复购率
  83. uid_buy = valid.groupby(['商品ID', '用户ID']).size().reset_index(name='购买次数')
  84. rep_uid = uid_buy[uid_buy['购买次数'] > 1].groupby('商品ID').size()
  85. spu_agg['复购UID数'] = spu_agg['商品ID'].map(rep_uid).fillna(0)
  86. spu_agg['复购率'] = (spu_agg['复购UID数'] / spu_agg['总UID']).fillna(0)
  87. # 6. 计算夜间占比(0-6点)
  88. night_orders = valid[valid['小时'].between(0, 6)]
  89. night_count = night_orders.groupby('商品ID').size()
  90. total_orders = valid.groupby('商品ID').size()
  91. night_ratio = night_count / total_orders
  92. spu_agg['夜间占比'] = spu_agg['商品ID'].map(night_ratio).fillna(0)
  93. # 7. 归一化处理(MinMaxScaler)
  94. metrics_cols = ['单位时间销量', '实收率', '稳定性得分', '复购率', '夜间占比']
  95. scaler = MinMaxScaler()
  96. spu_agg[metrics_cols] = scaler.fit_transform(spu_agg[metrics_cols])
  97. # 8. 计算爆款系数(按权重加权)
  98. weights = {
  99. '单位时间销量': 0.4, # 销售热度 40%
  100. '实收率': 0.3, # 价格接受度 30%
  101. '稳定性得分': 0.1, # 退款稳定性 10%
  102. '复购率': 0.1, # 复购热度 10%
  103. '夜间占比': 0.1 # 夜间爆发力 10%
  104. }
  105. spu_agg['爆款系数'] = (
  106. weights['单位时间销量'] * spu_agg['单位时间销量'] +
  107. weights['实收率'] * spu_agg['实收率'] +
  108. weights['稳定性得分'] * spu_agg['稳定性得分'] +
  109. weights['复购率'] * spu_agg['复购率'] +
  110. weights['夜间占比'] * spu_agg['夜间占比']
  111. )
  112. # 9. 分级标记
  113. def classify_level(score):
  114. if score >= 0.80:
  115. return '超级爆款'
  116. elif score >= 0.60:
  117. return '潜力爆款'
  118. elif score >= 0.40:
  119. return '常规款'
  120. else:
  121. return '清货款'
  122. spu_agg['爆款等级'] = spu_agg['爆款系数'].apply(classify_level)
  123. # 10. 获取商品标题(优先使用商品名称/SPU)
  124. if '商品名称' in valid.columns:
  125. title_map = valid.groupby('商品ID')['商品名称'].first()
  126. elif 'SPU' in valid.columns:
  127. title_map = valid.groupby('商品ID')['SPU'].first()
  128. elif '商品标题' in valid.columns:
  129. title_map = valid.groupby('商品ID')['商品标题'].first()
  130. else:
  131. title_map = {}
  132. spu_agg['商品标题'] = spu_agg['商品ID'].map(title_map).fillna('未知商品')
  133. # 11. 格式化结果
  134. results = {}
  135. for _, row in spu_agg.iterrows():
  136. spu_id = row['商品ID']
  137. results[spu_id] = {
  138. 'spu_id': spu_id,
  139. 'product_title': row['商品标题'],
  140. 'hotproduct_score': round(float(row['爆款系数']), 4),
  141. 'hotproduct_level': row['爆款等级'],
  142. 'metrics': {
  143. 'sales_heat': round(float(row['单位时间销量']), 4), # 销售热度
  144. 'price_acceptance': round(float(row['实收率']), 4), # 价格接受度
  145. 'refund_stability': round(float(row['稳定性得分']), 4), # 退款稳定性
  146. 'repurchase_rate': round(float(row['复购率']), 4), # 复购热度
  147. 'night_burst': round(float(row['夜间占比']), 4) # 夜间爆发力
  148. },
  149. 'raw_data': {
  150. 'total_quantity': int(row['总销量']),
  151. 'total_revenue': round(float(row['总实收']), 2),
  152. 'avg_price': round(float(row['标价']), 2),
  153. 'days_span': int(row['天数跨度']),
  154. 'daily_sales': round(float(row['总销量'] / row['天数跨度']), 2),
  155. 'unique_buyers': int(row['总UID']),
  156. 'repurchase_buyers': int(row['复购UID数']),
  157. 'refund_rate': round(float(row['退款率']), 4)
  158. }
  159. }
  160. # 12. 统计概览
  161. level_dist = spu_agg['爆款等级'].value_counts().to_dict()
  162. summary = {
  163. 'total_spu_count': len(spu_agg),
  164. 'level_distribution': level_dist,
  165. 'avg_score': round(float(spu_agg['爆款系数'].mean()), 4),
  166. 'max_score': round(float(spu_agg['爆款系数'].max()), 4),
  167. 'min_score': round(float(spu_agg['爆款系数'].min()), 4),
  168. 'top_5_spus': spu_agg.nlargest(5, '爆款系数')['商品ID'].tolist()
  169. }
  170. print(f"✅ 爆款分析完成: {len(results)} 个SPU")
  171. print(f"📊 等级分布: {level_dist}")
  172. return {
  173. 'success': True,
  174. 'message': '爆款系数分析完成',
  175. 'data': {
  176. 'spu_results': results,
  177. 'summary': summary,
  178. 'weights': weights
  179. }
  180. }