paper id: zszavvJh
paper author name: feifei cheng
paper author name: heng li
paper author name: y w wang
paper author name: martin skitmore
paper author name: p j forsythe
# Check whether any two distinct authors share affiliations, co-authors,
# co-author affiliations, frequent keywords, or publication venues.
# Every ordered pair (aid1, aid2) is visited, so each overlap is reported
# once in each direction.
_overlap_checks = (
    (org, '%s与%s机构重叠:'),
    (co_author, '%s与%s协作者重叠:'),
    (co_org, '%s与%s协作机构重叠:'),
    (keywords, '%s与%s关键词重叠:'),
    (venue, '%s与%s会议期刊重叠:'),
)
for aid1 in author_data:
    for aid2 in author_data:
        if aid1 == aid2:
            continue
        for _values_by_author, _message in _overlap_checks:
            # Compute the intersection once and reuse it for test and output.
            _shared = set(_values_by_author[aid1]) & set(_values_by_author[aid2])
            if _shared:
                print(_message % (aid1, aid2), _shared)
bXCEdRdj与WSFIcbxu关键词重叠: {'chemical composition'}
bXCEdRdj与WSFIcbxu会议期刊重叠: {'Journal of Agricultural and Food Chemistry'}
bXCEdRdj与gUSOTB2u会议期刊重叠: {'Journal of home economics'}
OaaWMLWs与5Bxjg2tr协作者重叠: {'Shoei Sato'}
OaaWMLWs与5Bxjg2tr会议期刊重叠: {'APSIPA'}
WSFIcbxu与bXCEdRdj关键词重叠: {'chemical composition'}
WSFIcbxu与bXCEdRdj会议期刊重叠: {'Journal of Agricultural and Food Chemistry'}
CsE5vsJA与JvJ9opoq协作者重叠: {'Hiroki Sakaji'}
gUSOTB2u与bXCEdRdj会议期刊重叠: {'Journal of home economics'}
5Bxjg2tr与OaaWMLWs协作者重叠: {'Shoei Sato'}
5Bxjg2tr与OaaWMLWs会议期刊重叠: {'APSIPA'}
JvJ9opoq与CsE5vsJA协作者重叠: {'Hiroki Sakaji'}
# Determine the range of publication years across all papers.
# Papers whose year is missing, unparsable, or outside 1500..2100 are
# reported and counted in pubs_no_year instead of affecting the range.
min_year = 3000  # sentinel above any accepted year
max_year = 0     # sentinel below any accepted year
pubs_no_year = 0
for paper_id, pub in tqdm(author_pub.items()):
    try:
        year = int(pub['year'])
    except (KeyError, TypeError, ValueError, OverflowError):
        # Only the failures a missing or malformed 'year' can produce are
        # handled; a bare except would also swallow KeyboardInterrupt.
        pubs_no_year += 1
        print("Paper {} has no year info".format(paper_id))
        continue
    if year < 1500 or year > 2100:
        pubs_no_year += 1
        print("Paper {} has wrong year info: {}".format(paper_id, year))
        continue
    min_year = min(min_year, year)
    max_year = max(max_year, year)
print(min_year, max_year)
print("共计{}份论文没有年份信息".format(pubs_no_year))
# For names 15..19, draw one figure per name showing how many papers each
# author sharing that name has.
# The keys are materialised as a list first: dict views cannot be sliced on
# Python 3, so `train_author.keys()[15:20]` raises TypeError.
for name in list(train_author.keys())[15:20]:
    f, ax = plt.subplots(figsize=(8, 6))
    plt.plot([len(pubs) for pubs in train_author[name].values()])
# Publication-year span of each author sharing the same name.
# One figure per name (names 15..19); one curve per author showing that
# author's sorted publication years.
count = 0
# The keys are materialised as a list first: dict views cannot be sliced on
# Python 3, so `train_author.keys()[15:20]` raises TypeError.
for name in list(train_author.keys())[15:20]:
    f, ax = plt.subplots(figsize=(8, 6))
    authors = train_author[name].keys()
    for author in authors:
        # Papers without a 'year' field are skipped.
        years = [
            train_pub[thesis]['year']
            for thesis in train_author[name][author]
            if 'year' in train_pub[thesis]
        ]
        plt.plot(np.sort(years))
    plt.ylim(1990, 2020)
    # Print the name of the fourth figure only.
    # NOTE(review): loop nesting was reconstructed from a flattened source
    # line; confirm that `count` advances once per name.
    if count == 3:
        print(name)
    count += 1
来源:赛道一,林志豪
无监督聚类DBSCAN(根据合作者和机构TFIDF进行相似度聚类)
# Unsupervised disambiguation: for every name in validate_data, build one
# text document per paper (co-author names + affiliations + abstract),
# vectorise the documents with TF-IDF and cluster them with cosine DBSCAN.
# res_dict[name] receives one list of paper ids per cluster label.
for author in validate_data:
    papers = validate_data[author]
    if not papers:
        # Nothing to cluster for this name.
        res_dict[author] = []
        continue

    coauther_orgs = []
    for paper in papers:
        authors = paper['authors']
        names = [precessname(paper_author['name']) for paper_author in authors]
        orgs = [
            preprocessorg(paper_author['org'])
            for paper_author in authors
            if 'org' in paper_author
        ]
        abstract = paper.get("abstract", '')
        coauther_orgs.append(etl(' '.join(names + orgs) + ' ' + abstract))

    tfidf = TfidfVectorizer().fit_transform(coauther_orgs)
    clf = DBSCAN(metric='cosine')
    s = clf.fit_predict(tfidf)

    # Group paper ids by the cluster label assigned to each sample.
    paper_dict = {}
    for label, paper in zip(clf.labels_, papers):
        paper_dict.setdefault(str(label), []).append(paper['id'])
    res_dict[author] = list(paper_dict.values())

# Project the TF-IDF matrix onto two principal components.
# NOTE(review): the original indentation was lost; this is placed after the
# loop, so it projects the matrix of the last clustered name. Confirm the
# intended placement.
tfidf = PCA(n_components=2, random_state=0).fit_transform(tfidf.toarray())