#pl.read_csv??
🔬 Proteomics Quality Control: Beyond the Basics
## Peptide Identification Rate Analysis
'''
path ='/Volumes/dgh-lab/PROTEOMICS_DATA_DUMP/020_2025_DUN_DH/DIA-NN/020_2025_DUN_DH/'
fname = '020_2025_DUN_DH-report.parquet'
tmp = pl.read_parquet(os.path.join(path,fname)).to_pandas()
tmp['RT.Diff']=tmp['RT.Stop']-tmp['RT.Start']
tmp['RT.Bin']=tmp['RT'].astype(int)
rt_dataset = tmp[['Run','RT','RT.Start','RT.Stop','RT.Diff','RT.Bin','Q.Value','Ms1.Area','Precursor.Quantity']]
rt_dataset = rt_dataset.sort_values(['Run','RT'])
rt_dataset['Significant']=(rt_dataset['Q.Value']<0.01).astype(int)
rt_dataset.head()
rt_dataset['Significant'].value_counts()
rt_dataset[rt_dataset['Run']=='020_2025-DUN_DH-GB-2T1-A'].plot(kind='scatter',x='RT',y='RT.Diff')
# eventually track
RT vs Precursor.Quantity / Ms1.Area
RT vs RT.Diff
'''
print(1)
# work in progress
1
prepare_data_MaxQuant
prepare_data_MaxQuant (df)
prepare_data_Spectronaut
prepare_data_Spectronaut (df)
prepare_data_DiaNN
prepare_data_DiaNN (df)
Process DIA-NN output data to prepare it for plotting retention time bin analysis.

This function takes DIA-NN report output and:

1. Creates retention time bins from the RT column
2. Groups data by Run and RT bin
3. Counts identified and non-identified peptides in each bin
4. Calculates the identification ratio
plot_proteomics_run
plot_proteomics_run (df, run_name=None, figsize=(12, 6), identified_color='blue', notidentified_color='red', alpha=1.0, add_labels=True, use_different_colors=False)
Plot RT_bin vs Identified and RT_bin vs NotIdentified for runs.
Example
path = '/Volumes/dgh-lab/PROTEOMICS_DATA_DUMP/020_2025_DUN_DH/DIA-NN/020_2025_DUN_DH/'
fname = '020_2025_DUN_DH-report.parquet'
tmp = pl.read_parquet(os.path.join(path,fname))
count_data = prepare_data_DiaNN(tmp)
count_data=count_data.to_pandas()
count_data.head()
plot_proteomics_run(count_data)