近期又接触到pandas, 简直相见恨晚
import pandas as pd
df1 = pd.read_excel("aa.xlsx")
df2 = pd.read_excel("bb.xlsx")
up1 = df1.loc[df1['regulated'] == "up"] # 筛选出regulated列为up的行
up1.dropna(subset=['gene'], inplace=True) # 删除gene为空的行
up2 = df2.loc[df2['regulated'] == "up"]
up2.dropna(subset=['gene'], inplace=True)
down1 = df1.loc[df1['regulated'] == "down"]
down1.dropna(subset=['gene'], inplace=True)
down2 = df2.loc[df2['regulated'] == "down"]
down2.dropna(subset=['gene'], inplace=True)
up12 = up1.loc[up1['gene'].isin(up2['gene'])].drop_duplicates(['gene']).sort_values(by='gene') # 去重排序
up12.to_excel("upa.xlsx", index=False)
up21 = up2.loc[up2['gene'].isin(up1['gene'])].drop_duplicates(['gene']).sort_values(by='gene')
up21.to_excel("upb.xlsx", index=False)
down12 = down1.loc[down1['gene'].isin(down2['gene'])].drop_duplicates(['gene']).sort_values(by='gene')
down12.to_excel("downa.xlsx", index=False)
down21 = down2.loc[down2['gene'].isin(down1['gene'])].drop_duplicates(['gene']).sort_values(by='gene')
down21.to_excel("downb.xlsx", index=False)
简直神器~
额, python做这个真心效果大赞! aa.txt 和 bb.txt 中间有一列内容有交集, 需要把这个交集和差集提出来
with open("aa.txt") as f1, open("bb.txt") as f2:
a_dict = {}
for line in f1.readlines():
_lst = line.strip().split("@")
if _lst[1] and _lst[1] != "gene":
a_dict[_lst[1]] = line
b_dict = {}
for line in f2.readlines():
_lst = line.strip().split("@")
if _lst[1] and _lst[1] != "gene":
b_dict[_lst[1]] = line
new_dict = dict((i, a_dict[i]) for i in a_dict if i in b_dict)
with open("12.csv", "w") as fh:
fh.write(
"\tgene\t#ID\tT01_Count\tT02_Count\tT01_FPKM\tT02_FPKM\tFDR\tlog2FC\tregulated\tCOG_class\tCOG_class_annotation\tGO_annotation\tKEGG_annotation\tSwissprot_annotation\tnr_annotation\n")
for key in new_dict:
fh.write(new_dict[key].replace("@", "\t"))
new_dict = dict((i, a_dict[i]) for i in a_dict if i not in b_dict)
with open("12_diff.csv", "w") as fh:
fh.write(
"\tgene\t#ID\tT01_Count\tT02_Count\tT01_FPKM\tT02_FPKM\tFDR\tlog2FC\tregulated\tCOG_class\tCOG_class_annotation\tGO_annotation\tKEGG_annotation\tSwissprot_annotation\tnr_annotation\n")
for key in new_dict:
fh.write(new_dict[key].replace("@", "\t"))
new_dict = dict((i, b_dict[i]) for i in a_dict if i in b_dict)
with open("13.csv", "w") as fh:
fh.write(
"\tgene\t#ID\tT01_Count\tT03_Count\tT01_FPKM\tT03_FPKM\tFDR\tlog2FC\tregulated\tCOG_class\tCOG_class_annotation\tGO_annotation\tKEGG_annotation\tSwissprot_annotation\tnr_annotation\n")
for key in new_dict:
fh.write(new_dict[key].replace("@", "\t"))
new_dict = dict((i, b_dict[i]) for i in b_dict if i not in a_dict)
with open("13_diff.csv", "w") as fh:
fh.write(
"\tgene\t#ID\tT01_Count\tT03_Count\tT01_FPKM\tT03_FPKM\tFDR\tlog2FC\tregulated\tCOG_class\tCOG_class_annotation\tGO_annotation\tKEGG_annotation\tSwissprot_annotation\tnr_annotation\n")
for key in new_dict:
fh.write(new_dict[key].replace("@", "\t"))
基本上就是先解析到需要比较的这一列, 已此列值为key, 每整行为value, 所以这两个文件可以分别建两个字典, 然后取字典key的交集, 输出value写入结果文件~