创建 table pros_sample,用于存放项目数据,其中只有一条记录:scikit-learn项目信息
创建语句:create table pros_sample as (select * from pro_topwatch where repo_name=’scikit-learn’)
对repo_id 设置索引:repo_index
创建 table issues_sample,用于存放项目对应的Issues数据(2016.9.1-2018.9.1),经过核实无数据缺失,共2158条记录
创建语句:create table issues_sample as (select * from issues_topwatch where repo_id in (select repo_id from pro_sample))
分别对repo_id和issue_id设置索引:repo_index,issue_index
创建 table issue_comments_sample,用于存放issues对应的commnets数据。
create table issue_comments_sample as(select * from issue_comment_topwatch where issue_id in (select issue_id from issues_sample))
共15324条记录,有重复记录,需删除重复记录删除重复记录的方式有些简单粗暴:
首先,create table issue_comments_sample2 as (select DISTINCT* from issue_comments_sample)
然后,删除之前的issue_comments_sample,将issue_comments_sample2更名为:issue_comments_sample
删除后的记录为12129,经核实无误。
对issue_id设置索引:issue_index
创建 table labels_sample,用于存放项目的label数据,经核实数据无误,共19条记录
create table labels_sample as (select * from labels_topwatch where repo_id in (select repo_id from pros_sample))
创建 table prs_sample,用于存放项目所有的PR记录,以此来计算firstpr 数据。用时125.3秒,共6657条记录,经核实数据无误
create table prs_sample as (select p.repo_id as repo_id,prh.pull_request_id as pull_request_id,prh.actor_id as actor_id,
i.issue_id as pr_num,prh.created_at as created_at from pros_sample p, issues i,pull_request_history prh
where p.repo_id=i.repo_id and i.pull_request=1 and i.pull_request_id=prh.pull_request_id and prh.action=’opened’)
为方便关联数据,创建一个新表:issue_comments_sample2,在issue_comments_sample表中增加repo_id 信息,数据无误
create table issue_comments_sample2 as (select ic.*,i.repo_id from issue_comments_sample ic,issues_sample i where ic.issue_id=i.issue_id)
计算用户在某项目中firstPR时间,以此来判断用户在ISSUE做评论时是什么用户角色,用时0.436s,共1793条记录,经核实数据无误
这个项目所有提交过pull request的用户ID都在这张表中
create table firstpr_sample as (select actor_id as user_id, repo_id,issue_id, min(created_at) as mincreated_at from prs_sample GROUP BY actor_id,repo_id)
发现users表数据不全,暂不需要,不做补充。
9.分析issue_comment表中用户身份,如果评论时间早于firstPR时间,则表示评论时还没提交过PR,是项目新手,如果评论时间晚于firstPR时间,则表示评论时已经是贡献者;项目成员的信息通过其他方式计算。
为方便运算,将issue_comment表和firstpr_sample的关联数据放入临时表temp4,共9988条记录,其中有一条是重复记录,暂不清楚产生的原因,也没做处理
create table temp4 as (select ic.user_id,ic.repo_id,ic.issue_id,ic.created_at,f.mincreated_at
from issue_comments_sample2 ic, firstpr_sample f where ic.repo_id=f.repo_id and ic.user_id=f.user_id order by user_id,issue_id)
在main.py程序中加入mark_usertype()函数,issue_comments_sample2 有12129条记录,运行了488s,出了结果,经核实,没有区分member和contributor,另写程序实现
创建member_sample表,用于存放项目全部的审阅信息
create table member_sample as (select DISTINCT c.project_id as repo_id, c.id as commit_id,
c.committer_id as committer_id, c.created_at as created_at from commits c, pros_sample p where c.project_id=p.repo_id)
insert into member_sample select p.repo_id,prh.pull_request_id,prh.actor_id,prh.created_at
from prs_sample pr,pros_sample p, pull_request_history prh where prh.action=’merged’
and prh.pull_request_id=pr.pull_request_id and pr.repo_id=p.repo_id
11.计算member的第一次审阅信息,将信息存放到firstaudit_sample表中,经核实数据无误,共764条记录
create table firstaudit_sample as (select repo_id, committer_id,min(created_at)
as mincreated_at from member_sample group by repo_id,committer_id)
12.将issue_labels信息填入到issues_sample表中
13.获取审阅人信息
create table member_top as (select DISTINCT c.project_id as repo_id, c.id as commit_id, c.committer_id as committer_id, c.created_at as created_at from commits c, pro_top p where c.project_id=p.repo_id)
insert into member_top select p.repo_id,prh.pull_request_id,prh.actor_id,prh.created_at from pr_top_t pr,pro_top p, pull_request_history prh where prh.action=’merged’ and prh.pull_request_id=pr.pull_request_id and pr.repo_id=p.repo_id
create table firstaudit_top as (select repo_id, committer_id,min(created_at) as mincreated_at from member_top group by repo_id,committer_id)
更新issue_events 表格
s1 = time.clock()
csv_read = csv.reader(open(‘E:\mysql-2018-09-01\data\issue_events.csv’, ‘r’))
code = “select issue_id from issues_top”
cursor.execute(code)
repoid_info = cursor.fetchall()
print(“time of extract data:”, time.clock()-s1)
s2 = time.clock()
for row1 in repoid_info:
for row2 in csv_read:
if row1[0] == row2[1] and row2[5] >= '2018-01-01':
code = "insert into issue_events values(%d,%d,%d,'%s','%s','%s')" % \
(int(row2[0]), int(row2[1]), int(row2[2]), row[3], row[4], row[5])
try:
cursor.execute(code)
except Exception as e:
print(e)
select pr.pull_request_id from pr_top_t pr, pull_request_history prh where pr.pull_request_id=prh.pull_request_id and prh.action=’merged’ and pr.repo_id=5219
在pr_top中标记合并的pr信息,
update pr_top set has_merged=1 where pull_request_id in (select pr.pull_request_id from pr_top_t pr, pull_request_history prh where pr.pull_request_id=prh.pull_request_id and prh.action=’merged’)
create table firstmerge_top as (select repo_id, user_id, min(created_time) as minmerged_time from pr_top group by repo_id, user_id
16.
#获得pull request 与 issue 的关联关系
select p.pull_request_id, i.issue_id,i.with_gflabel from issues_top i, issue_events_top ie, commits_top c,pr_top p where i.issue_id=ie.issue_id and ie.action_specified=c.sha and c.id=p.base_commit_id
17.取issue_event近两年信息
#小样本测试,用时**s
select * from issue_events ie, issues_top i where ie.issue_id=i.issue_id and i.repo_id=5219
在pr_top中标注member信息
小样本验证,通过,用时181s
create table test as (select pr.pull_request_id from pr_top pr, firstaudit_top fa where pr.repo_id=fa.repo_id and pr.actor_id=fa.committer_id and pr.repo_id=5219 and pr.created_at>fa.mincreated_at)
select pr.pull_request_id from pr_top pr, firstaudit_top fa where pr.repo_id=fa.repo_id and pr.actor_id=fa.committer_id and pr.repo_id=5219 and pr.created_at>fa.mincreated_at
update pr_top set actor_type=2 where pull_request_id in (select * from test)
#最终代码,用时s,记录
create table tes1 as (select pr.pull_request_id from pr_top pr, firstaudit_top fa where pr.repo_id=fa.repo_id #用时347s
and pr.actor_id=fa.committer_id and pr.created_at>fa.mincreated_at)
update pr_top set actor_type=2 where pull_request_id in (select * from tes1)
在pr_top中标注member信息
update pr_top set actor_type=2 where pull_request_id in (select * from tes1)
创建近两年的开发者信息表,用时5s
create table developer_info as (select distinct repo_id, actor_id from pr_top)
#添加firstpr_time,firstmerg_time,firstaudit_time,flag 字段,添加索引
select d.repo_id,d.actor_id,pr.mincreated_at from developer_info_sample d, firstpr_top pr
where d.repo_id=pr.repo_id and d.actor_id=pr.user_id
#添加firstpr_time 用时367s,影响82558条记录
update developer_info_sample d set d.firstpr_time=(select pr.mincreated_at
from firstpr_top pr where d.repo_id=pr.repo_id and d.actor_id=pr.user_id)
#添加firstaudit_time,用时14s, 影响11139条记录
update developer_info_sample d set d.firstaudit_time=(select au.mincreated_at
from firstaudit_top au where d.repo_id=au.repo_id and d.actor_id=au.committer_id)
#将member数据写入pro_top表中
2018.11.16 16:33记录
要解决的问题:
等待语句update pr_top_t set has_merged=1 where pull_request_id in (select * from firstmerge_temp) 执行完毕,进而创建firstmerge_top表
create table firstmerge_top as (select repo_id, actor_id,min(created_at) as mincreated_at
from pr_top_t GROUP BY repo_id,actor_id)
向developer_info表,添加firstmerge_time 数据
update developer_info_top d set d.firstmerge_time=(select me.mincreated_at
from firstmerge_top me where d.repo_id=me.repo_id and d.actor_id=me.committer_id)
向pr_top 表填入contributor信息
先测试,再执行
update pr_top set actor_type=1 where (repo_id,actor_id) in (select d.repo_id,d.actor_id from pr_top pr, developer_info_top d where d.firstmerge_time is not null
and pr.repo_id=d.repo_id and pr.actor_id=d.actor_id and pr.created_at>d.firstmerge_time)
update pr_top set actor_type=2 where (repo_id,actor_id) in (select d.repo_id,d.actor_id from pr_top pr, developer_info_top d where d.firstaudit_time is not null
and pr.repo_id=d.repo_id and pr.actor_id=d.actor_id and pr.created_at>d.firstaudit_time)
将contributor的数量写入pro_top表中
update pro_top p set n_contributor= (select count(firstmerge_time) from developer_info_top d
where p.repo_id=d.repo_id and d.firstmerge_time is not null group by repo_id)
计算pr_top中actor_type=2 即为member提交的pr 数量,计算pr_top中actor_type=0 和1 的数量,即为外围贡献者提交的Pr 数量
select repo_id,count(actor_type) from pr_top where actor_type=2 group by repo_id
select repo_id,count(actor_type) from pr_top where actor_type=1 or actor_type=0 group by repo_id
将用户类别写入issue_comments_top2表中
各种基于 issue_comments_top2表信息的统计
pr_top_t2存放的是近两年参与贡献的用户的所有信息
create table pr_top_t2 as (select p.* from pr_top_t p, developer_info_top d where p.repo_id=d.repo_id and p.actor_id=d.actor_id)
select * from pull_request_history where pull_request_id=459167
存放被merge的prid信息,稍后运行总样本
create table practionid_temp as (select pr.repo_id,pr.pull_request_id,prh.id,prh.action,prh.created_at from prs_sample pr, pull_request_history prh where pr.pull_request_id=prh.pull_request_id and prh.action=’merged’)
在prs_sample中标记merge信息
update prs_sample set is_merged=1 where pull_request_id in (select pull_request_id from practionid_temp)
计算firstmerge_sample信息
create table firstmerge_sample as (select repo_id,actor_id,min(created_at) as mincreated_at from prs_sample where is_merged=1 group by repo_id,actor_id)
select repo_id,actor_id,min(created_at) from prs_sample where is_merged=1 group by repo_id,actor_id
更新developer_info_sample中firstmerge_time记录
update developer_info_sample d set firstmerge_time= (select fm.mincreated_at from firstmerge_sample fm where d.actor_id=fm.actor_id)
验证数据是否正确,正确
select * from firstmerge_sample where actor_id=3763486
创建get_issueevent_sample 用于从网上获取issue_event数据
create table get_issueevent_sample as (select p.owner_name,p.repo_name,i.issue_num,p.repo_id,i.issue_id from pros_sample p, issues_sample i where p.repo_id=i.repo_id)
select * from get_issueevent_sample
统计issues_sample表中的数据
计算参与讨论的newcomer 数据
select issue_id,count() from issue_comments_sample2 where user_type=1 GROUP BY issue_id
#新人评论数量
update issues_sample i set nNcCmt= (select count() from issue_comments_sample2 ic where user_type=0 and i.issue_id=ic.issue_id GROUP BY issue_id )
#参与评论的新人数量
update issues_sample i set nNcParti= (select count(distinct user_id) from issue_comments_sample2 ic where user_type=0 and i.issue_id=ic.issue_id GROUP BY issue_id )
新人评论数量
update issues_sample i set nMbCmt= (select count(*) from issue_comments_sample2 ic where user_type=2 and i.issue_id=ic.issue_id GROUP BY issue_id )
#参与评论的新人数量
update issues_sample i set nMbParti= (select count(distinct user_id) from issue_comments_sample2 ic where user_type=2 and i.issue_id=ic.issue_id GROUP BY issue_id )
select issue_id, count(*) from issue_comments_sample2 where user_type=1 GROUP BY issue_id
update issues_sample set nNcParti=null where nNcParti is not null
select i.issue_id, i.user_id, i.created_at,d.firstpr_time,d.firstmerge_time,d.firstaudit_time
from issue_comments_sample2 i, developer_info_sample d where i.repo_id=d.repo_id and i.user_id=d.actor_id
工程文件说明
目录说明
环境说明
创建 table pros_sample,用于存放项目数据,其中只有一条记录:scikit-learn项目信息 创建语句:create table pros_sample as (select * from pro_topwatch where repo_name=’scikit-learn’) 对repo_id 设置索引:repo_index
创建 table issues_sample,用于存放项目对应的Issues数据(2016.9.1-2018.9.1),经过核实无数据缺失,共2158条记录 创建语句:create table issues_sample as (select * from issues_topwatch where repo_id in (select repo_id from pro_sample)) 分别对repo_id和issue_id设置索引:repo_index,issue_index
创建 table issue_comments_sample,用于存放issues对应的commnets数据。 create table issue_comments_sample as(select * from issue_comment_topwatch where issue_id in (select issue_id from issues_sample)) 共15324条记录,有重复记录,需删除重复记录删除重复记录的方式有些简单粗暴: 首先,create table issue_comments_sample2 as (select DISTINCT* from issue_comments_sample) 然后,删除之前的issue_comments_sample,将issue_comments_sample2更名为:issue_comments_sample 删除后的记录为12129,经核实无误。 对issue_id设置索引:issue_index
创建 table labels_sample,用于存放项目的label数据,经核实数据无误,共19条记录 create table labels_sample as (select * from labels_topwatch where repo_id in (select repo_id from pros_sample))
通过查询issue_labels,确定存在数据缺失的情况,必须从API上获取数据。 写了main.py,用了time.clock()功能,从api上获取数据,一共是2158条记录,用时3271s,这个需要优化
创建 table prs_sample,用于存放项目所有的PR记录,以此来计算firstpr 数据。用时125.3秒,共6657条记录,经核实数据无误 create table prs_sample as (select p.repo_id as repo_id,prh.pull_request_id as pull_request_id,prh.actor_id as actor_id, i.issue_id as pr_num,prh.created_at as created_at from pros_sample p, issues i,pull_request_history prh where p.repo_id=i.repo_id and i.pull_request=1 and i.pull_request_id=prh.pull_request_id and prh.action=’opened’)
为方便关联数据,创建一个新表:issue_comments_sample2,在issue_comments_sample表中增加repo_id 信息,数据无误 create table issue_comments_sample2 as (select ic.*,i.repo_id from issue_comments_sample ic,issues_sample i where ic.issue_id=i.issue_id)
计算用户在某项目中firstPR时间,以此来判断用户在ISSUE做评论时是什么用户角色,用时0.436s,共1793条记录,经核实数据无误 这个项目所有提交过pull request的用户ID都在这张表中 create table firstpr_sample as (select actor_id as user_id, repo_id,issue_id, min(created_at) as mincreated_at from prs_sample GROUP BY actor_id,repo_id) 发现users表数据不全,暂不需要,不做补充。
9.分析issue_comment表中用户身份,如果评论时间早于firstPR时间,则表示评论时还没提交过PR,是项目新手,如果评论时间晚于firstPR时间,则表示评论时已经是贡献者;项目成员的信息通过其他方式计算。 为方便运算,将issue_comment表和firstpr_sample的关联数据放入临时表temp4,共9988条记录,其中有一条是重复记录,暂不清楚产生的原因,也没做处理 create table temp4 as (select ic.user_id,ic.repo_id,ic.issue_id,ic.created_at,f.mincreated_at from issue_comments_sample2 ic, firstpr_sample f where ic.repo_id=f.repo_id and ic.user_id=f.user_id order by user_id,issue_id) 在main.py程序中加入mark_usertype()函数,issue_comments_sample2 有12129条记录,运行了488s,出了结果,经核实,没有区分member和contributor,另写程序实现
insert into member_sample select p.repo_id,prh.pull_request_id,prh.actor_id,prh.created_at from prs_sample pr,pros_sample p, pull_request_history prh where prh.action=’merged’ and prh.pull_request_id=pr.pull_request_id and pr.repo_id=p.repo_id
11.计算member的第一次审阅信息,将信息存放到firstaudit_sample表中,经核实数据无误,共764条记录 create table firstaudit_sample as (select repo_id, committer_id,min(created_at) as mincreated_at from member_sample group by repo_id,committer_id)
12.将issue_labels信息填入到issues_sample表中
13.获取审阅人信息 create table member_top as (select DISTINCT c.project_id as repo_id, c.id as commit_id, c.committer_id as committer_id, c.created_at as created_at from commits c, pro_top p where c.project_id=p.repo_id)
insert into member_top select p.repo_id,prh.pull_request_id,prh.actor_id,prh.created_at from pr_top_t pr,pro_top p, pull_request_history prh where prh.action=’merged’ and prh.pull_request_id=pr.pull_request_id and pr.repo_id=p.repo_id
create table firstaudit_top as (select repo_id, committer_id,min(created_at) as mincreated_at from member_top group by repo_id,committer_id)
更新issue_events 表格
s1 = time.clock() csv_read = csv.reader(open(‘E:\mysql-2018-09-01\data\issue_events.csv’, ‘r’)) code = “select issue_id from issues_top” cursor.execute(code) repoid_info = cursor.fetchall() print(“time of extract data:”, time.clock()-s1) s2 = time.clock() for row1 in repoid_info: for row2 in csv_read:对pr_top中的记录记录是否被merge ,如果被merge才是contributor, 否则还是newcomer
用于验证,用时70s,共2833条记录
select pr.pull_request_id from pr_top_t pr, pull_request_history prh where pr.pull_request_id=prh.pull_request_id and prh.action=’merged’ and pr.repo_id=5219在pr_top中标记合并的pr信息,
update pr_top set has_merged=1 where pull_request_id in (select pr.pull_request_id from pr_top_t pr, pull_request_history prh where pr.pull_request_id=prh.pull_request_id and prh.action=’merged’)创建firstmerge_top表格,存放用户在项目中的第一次被合并的pr信息,他们就是项目的contributor
create table firstmerge_top as (select repo_id, user_id, min(created_time) as minmerged_time from pr_top group by repo_id, user_id16. #获得pull request 与 issue 的关联关系 select p.pull_request_id, i.issue_id,i.with_gflabel from issues_top i, issue_events_top ie, commits_top c,pr_top p where i.issue_id=ie.issue_id and ie.action_specified=c.sha and c.id=p.base_commit_id
17.取issue_event近两年信息 #小样本测试,用时**s select * from issue_events ie, issues_top i where ie.issue_id=i.issue_id and i.repo_id=5219
在pr_top中标注member信息
小样本验证,通过,用时181s
create table test as (select pr.pull_request_id from pr_top pr, firstaudit_top fa where pr.repo_id=fa.repo_id and pr.actor_id=fa.committer_id and pr.repo_id=5219 and pr.created_at>fa.mincreated_at)
select pr.pull_request_id from pr_top pr, firstaudit_top fa where pr.repo_id=fa.repo_id and pr.actor_id=fa.committer_id and pr.repo_id=5219 and pr.created_at>fa.mincreated_at
update pr_top set actor_type=2 where pull_request_id in (select * from test) #最终代码,用时s,记录 create table tes1 as (select pr.pull_request_id from pr_top pr, firstaudit_top fa where pr.repo_id=fa.repo_id #用时347s and pr.actor_id=fa.committer_id and pr.created_at>fa.mincreated_at) update pr_top set actor_type=2 where pull_request_id in (select * from tes1)
在pr_top中标注member信息
update pr_top set actor_type=2 where pull_request_id in (select * from tes1)
创建近两年的开发者信息表,用时5s
create table developer_info as (select distinct repo_id, actor_id from pr_top) #添加firstpr_time,firstmerg_time,firstaudit_time,flag 字段,添加索引 select d.repo_id,d.actor_id,pr.mincreated_at from developer_info_sample d, firstpr_top pr where d.repo_id=pr.repo_id and d.actor_id=pr.user_id #添加firstpr_time 用时367s,影响82558条记录 update developer_info_sample d set d.firstpr_time=(select pr.mincreated_at from firstpr_top pr where d.repo_id=pr.repo_id and d.actor_id=pr.user_id) #添加firstaudit_time,用时14s, 影响11139条记录 update developer_info_sample d set d.firstaudit_time=(select au.mincreated_at from firstaudit_top au where d.repo_id=au.repo_id and d.actor_id=au.committer_id) #将member数据写入pro_top表中
2018.11.16 16:33记录 要解决的问题:
update pr_top set actor_type=2 where (repo_id,actor_id) in (select d.repo_id,d.actor_id from pr_top pr, developer_info_top d where d.firstaudit_time is not null and pr.repo_id=d.repo_id and pr.actor_id=d.actor_id and pr.created_at>d.firstaudit_time)
将contributor的数量写入pro_top表中 update pro_top p set n_contributor= (select count(firstmerge_time) from developer_info_top d where p.repo_id=d.repo_id and d.firstmerge_time is not null group by repo_id)
计算pr_top中actor_type=2 即为member提交的pr 数量,计算pr_top中actor_type=0 和1 的数量,即为外围贡献者提交的Pr 数量 select repo_id,count(actor_type) from pr_top where actor_type=2 group by repo_id select repo_id,count(actor_type) from pr_top where actor_type=1 or actor_type=0 group by repo_id
将用户类别写入issue_comments_top2表中
pr_top_t2存放的是近两年参与贡献的用户的所有信息
create table pr_top_t2 as (select p.* from pr_top_t p, developer_info_top d where p.repo_id=d.repo_id and p.actor_id=d.actor_id)
select * from pull_request_history where pull_request_id=459167
存放被merge的prid信息,稍后运行总样本
create table practionid_temp as (select pr.repo_id,pr.pull_request_id,prh.id,prh.action,prh.created_at from prs_sample pr, pull_request_history prh where pr.pull_request_id=prh.pull_request_id and prh.action=’merged’)
在prs_sample中标记merge信息
update prs_sample set is_merged=1 where pull_request_id in (select pull_request_id from practionid_temp)
计算firstmerge_sample信息
create table firstmerge_sample as (select repo_id,actor_id,min(created_at) as mincreated_at from prs_sample where is_merged=1 group by repo_id,actor_id)
select repo_id,actor_id,min(created_at) from prs_sample where is_merged=1 group by repo_id,actor_id
更新developer_info_sample中firstmerge_time记录
update developer_info_sample d set firstmerge_time= (select fm.mincreated_at from firstmerge_sample fm where d.actor_id=fm.actor_id)
验证数据是否正确,正确
select * from firstmerge_sample where actor_id=3763486
创建get_issueevent_sample 用于从网上获取issue_event数据
create table get_issueevent_sample as (select p.owner_name,p.repo_name,i.issue_num,p.repo_id,i.issue_id from pros_sample p, issues_sample i where p.repo_id=i.repo_id)
select * from get_issueevent_sample
统计issues_sample表中的数据
计算参与讨论的newcomer 数据
select issue_id,count() from issue_comments_sample2 where user_type=1 GROUP BY issue_id #新人评论数量 update issues_sample i set nNcCmt= (select count() from issue_comments_sample2 ic where user_type=0 and i.issue_id=ic.issue_id GROUP BY issue_id ) #参与评论的新人数量 update issues_sample i set nNcParti= (select count(distinct user_id) from issue_comments_sample2 ic where user_type=0 and i.issue_id=ic.issue_id GROUP BY issue_id )
新人评论数量
update issues_sample i set nMbCmt= (select count(*) from issue_comments_sample2 ic where user_type=2 and i.issue_id=ic.issue_id GROUP BY issue_id ) #参与评论的新人数量 update issues_sample i set nMbParti= (select count(distinct user_id) from issue_comments_sample2 ic where user_type=2 and i.issue_id=ic.issue_id GROUP BY issue_id )
select issue_id, count(*) from issue_comments_sample2 where user_type=1 GROUP BY issue_id
update issues_sample set nNcParti=null where nNcParti is not null
select i.issue_id, i.user_id, i.created_at,d.firstpr_time,d.firstmerge_time,d.firstaudit_time from issue_comments_sample2 i, developer_info_sample d where i.repo_id=d.repo_id and i.user_id=d.actor_id