首页 > 技术文章 > 大数据统计脚本, 分城市订单统计

realzjx 2016-08-12 11:57 原文

 

date_parameter <- "2016-08-01"
start_date<-as.Date(date_parameter)
dayCount_parameter = 1
array = strsplit(as.character(start_date),"-")
year = as.character(array[1])
month=as.character(array[2])


hiveContext <- sparkRHive.init(sc)
sql(hiveContext, "use honeycomb_bh_db")

if(dayCount_parameter>0){
date_parameter <-as.character(start_date)
#通过hiveSql 获得想要的并集集合并且缓存下来 sql date_add
##程序执行阶段1: 数据准备。。。。。
acquired_users_sql <-"select presentee_mobile,recommend_mobile, shareid from sc_t_acquire_record where sc_t_acquire_record.year=2016 and sc_t_acquire_record.month=08 and to_date(ct_time)='STARTDATE' and shareid is not null"
all_order_sql <- "select distinct passenger_phone, (create_time) order_time,city_id from sc_t_order_all_info As a where a.year=2016 and a.month=08 and to_date(a.create_time)>='STARTDATE' and to_date(a.create_time)<=date_add(date('STARTDATE'),7) and product_id=210"

# user_agent, app
share_id_city_id_sql <-"select b.* from (select distinct(receivePhone), min(substr(share_id, 11)) time from sc_analyze_acquire_event As a where a.year=2016 and a.month=08 and to_date(create_time)='STARTDATE' and (event_name like '%2103%' or event_name like '%2102%') group by receivePhone) As a inner join (select *, substr(share_id, 11) time2 from sc_analyze_acquire_event) as b on a.receivePhone=b.receivePhone and a.time=b.time2"
share_id_city_id_sql <- gsub(pattern='STARTDATE', replacement=date_parameter, share_id_city_id_sql)

acquired_users_sql<-gsub(pattern='STARTDATE', replacement=date_parameter, acquired_users_sql)
all_order_sql<-gsub(pattern='STARTDATE', replacement=date_parameter, all_order_sql)

share_id_city_id <-sql(hiveContext,share_id_city_id_sql);

#当天领券绑定的用户集合
acquired_users <-sql(hiveContext,acquired_users_sql)
acquired_users<-distinct(acquired_users)
cache(acquired_users)


#7日内的全订单集合
all_orders <-sql(hiveContext,all_order_sql)

acquired_users_with_orders_sql = paste("select * from (", acquired_users_sql,") As acquire inner join (",all_order_sql, ") As orders on acquire.presentee_mobile = orders.passenger_phone where orders.passenger_phone is not null and acquire.presentee_mobile is not null",sep="")
acquired_users_with_orders <-sql(hiveContext,acquired_users_with_orders_sql)
cache(acquired_users_with_orders)

cache(share_id_city_id)
acquired_users <- join(acquired_users,share_id_city_id, share_id_city_id$receivePhone==acquired_users$presentee_mobile, "inner")
acquired_users$receivePhone=NULL
#acquired_users<-filter(acquired_users, "shareid is not null")
acquired_users$acquire_city_id = acquired_users$city_id
acquired_users$city_id =NULL
acquired_users$recommendPhone=NULL
acquired_users$receivePhone=NULL


group1 <- count(group_by(acquired_users, "acquire_city_id"))
acquired_users_with_orders = distinct(acquired_users_with_orders)
#######去掉, 不在一个同一个城市领券打车的人数。 。。。对异地领券,异地打车, 在分城市统计的时候忽略不计。
acquired_users_with_orders$passenger_phone=NULL
acquired_users_with_orders$forJoin = acquired_users_with_orders$city_id
acquired_users_with_orders$city_id = NULL
acquired_users_with_orders<-join(acquired_users_with_orders,share_id_city_id, acquired_users_with_orders$shareid==share_id_city_id$share_id & acquired_users_with_orders$presentee_mobile == share_id_city_id$receivePhone & share_id_city_id$city_id==acquired_users_with_orders$forJoin & share_id_city_id$recommendPhone==acquired_users_with_orders$recommend_mobile, "inner")
acquired_users_with_orders$forJoin=NULL

group2 <-count(group_by(acquired_users_with_orders, "city_id"))
group2$order_count= group2$count
group2$count=NULL
group2$order_city_id = group2$city_id
group2$city_id=NULL
group3 <- join(group1,group2, group1$acquire_city_id==group2$order_city_id, "inner")
group3$ratio <- group3$order_count/group3$count
cache(group3)
cityName <- sql(hiveContext, "select * from sc_mis_city")
city_conv_rank = join(group3, cityName, group3$order_city_id==cityName$id, "inner")
city_conv_rank$id=NULL
city_conv_rank<-orderBy(city_conv_rank, -city_conv_rank$ratio)
city_conv_rank$pid=NULL
city_conv_rank$coupon_count=city_conv_rank$count
city_conv_rank$count=NULL
city_conv_rank$first_order_count=city_conv_rank$order_count
city_conv_rank$order_count=NULL
city_conv_rank$convert_ratio = city_conv_rank$ratio
city_conv_rank$ratio = NULL
}
showDF(city_conv_rank, 1500)

推荐阅读