miRNA sequence analysis

Calculate entropy, top-ranked miRNAs, and co-expression changes

Posted by Xuan on February 28, 2021

miRNA analysis script

Here are scripts for miRNA analysis

  1. Calculate unique read

1. Total unique reads

Calculate the count of each unique read from a FASTQ file.

------ file process.sh.  -------
#!/usr/bin/env bash
# process.sh - count how often each distinct read sequence occurs in a FASTQ file.
#
# Usage: ./process.sh INPUT_FASTQ OUTPUT_NAME
#   INPUT_FASTQ  FASTQ file (four lines per record)
#   OUTPUT_NAME  result file, written as ./OUTPUT_NAME; one "<count> <sequence>"
#                row per distinct sequence, highest count first
# Side effects: (re)writes input.fq and id_read.txt in the current directory.
# Exit status: 2 when called with the wrong number of arguments.

main() {
  if [ "$#" -ne 2 ]; then
    printf 'usage: %s INPUT_FASTQ OUTPUT_NAME\n' "${0##*/}" >&2
    return 2
  fi
  local fastq=$1 out=$2

  # Odd-numbered lines (the read ID and the "+" line) are cut down to their
  # first field; sequence and quality lines pass through unchanged.
  awk '{ if (NR % 2 == 1) print $1; else print $0 }' "$fastq" > input.fq || return

  # Join each group of four lines into one row, then keep
  # "<id> <third-from-last token> <last token>" (id, sequence, quality).
  awk '{ ORS = (NR % 4 != 0) ? " " : "\n" } 1' input.fq |
    awk '{ print $1 " " $(NF-2) " " $NF }' > ./id_read.txt || return

  # Tally identical sequences and list the most frequent first.
  awk '{ print $2 }' ./id_read.txt | sort | uniq -c | sort -r -nk1 > "./$out"
}

main "$@"

-------- run command-----
$1 : input fastq file
$2 : output file name

./process.sh Low_glucose.50_clean.fq low.txt

2. Co-expression

# Run the find_common helper on the expreLevel_cor_high.txt and
# expreLevel_cor_low.txt tables.
# NOTE(review): the meaning of -e and -c is not shown here; presumably they
# name the two tables to intersect - confirm against find_common's usage text.
./find_common -e /home/xuanzhan/Data/glucose/expreLevel_cor_high.txt -c /home/xuanzhan/Data/glucose/expreLevel_cor_low.txt
# output : common_list_gc.txt


--- Merge with the error-corrected files ----
--------- output file: [seq] [Before_cor_count_in_high] [After_cor_count_in_high]  [Before_cor_count_in_low] [After_cor_count_in_low] -----

# merge_corrected COMMON COR_HIGH COR_LOW
#   COMMON    rows of "<seq> <before_high> <before_low>"
#   COR_HIGH  rows of "<count> <seq>" (high sample after correction)
#   COR_LOW   rows of "<count> <seq>" (low sample after correction)
#   Prints "<seq> <before_high> <after_high> <before_low> <after_low>".
#   A sequence missing from a corrected table gets 0, so the five columns
#   never shift. Everything happens in one pass; no tmp.txt scratch file.
merge_corrected() {
  # FILENAME (not NR == FNR) tells the files apart, so an empty corrected
  # table cannot cause the next file to be mistaken for it.
  awk 'FILENAME == ARGV[1] { high[$2] = $1; next }
       FILENAME == ARGV[2] { low[$2] = $1; next }
       {
         after_high = ($1 in high) ? high[$1] : 0
         after_low  = ($1 in low)  ? low[$1]  : 0
         print $1, $2, after_high, $3, after_low
       }' "$2" "$3" "$1"
}

merge_corrected /home/xuanzhan/Data/miRNA/code/common_glu_Rankbyhigh.txt \
  expreLevel_cor_corhigh.txt expreLevel_cor_corlow.txt \
  > common_glucose_bH_aH_bL_aL.txt

3. tissue-specific expression

---------- exist in Low file, not exist in High file -------------
# only_in KEEP EXCLUDE
#   KEEP, EXCLUDE: tables with rows of "<count> <sequence>".
#   Prints "<sequence> <count> " for every KEEP row whose sequence has no row
#   in EXCLUDE, highest count first.
only_in() {
  # FILENAME == ARGV[1] (not NR == FNR) marks the EXCLUDE file, so an empty
  # EXCLUDE lets every KEEP row through instead of producing no output.
  awk 'FILENAME == ARGV[1] { seen[$2] = 1; next }
       !($2 in seen)       { print $2 " " $1 " " }' "$2" "$1" |
    sort -k2nr
}

# Keep sequences absent from high, sorted by expression level (after correction).
only_in expreLevel_cor_corlow.txt expreLevel_cor_corhigh.txt > lowspecific_cor.txt

# Keep sequences absent from high, sorted by expression level (before correction).
only_in expreLevel_cor_low.txt expreLevel_cor_high.txt > lowspecific.txt

----------- exist in High file, not exist in Low file -------------

# only_in KEEP EXCLUDE
#   KEEP, EXCLUDE: tables with rows of "<count> <sequence>".
#   Prints "<sequence> <count> " for every KEEP row whose sequence has no row
#   in EXCLUDE, highest count first.
only_in() {
  # FILENAME == ARGV[1] (not NR == FNR) marks the EXCLUDE file, so an empty
  # EXCLUDE lets every KEEP row through instead of producing no output.
  awk 'FILENAME == ARGV[1] { seen[$2] = 1; next }
       !($2 in seen)       { print $2 " " $1 " " }' "$2" "$1" |
    sort -k2nr
}

# with_corrected BEFORE AFTER
#   BEFORE, AFTER: outputs of only_in ("<sequence> <count> " rows).
#   Prints "<row number in BEFORE> <BEFORE row> <count in AFTER>", ordered by
#   the AFTER count (descending); the last column is empty when the sequence
#   is not in AFTER.
with_corrected() {
  awk 'FILENAME == ARGV[1] { after[$1] = $2; next }
       { print FNR " " $0 " " after[$1] }' "$2" "$1" |
    sort -k4nr
}

# High-only sequences, after and before correction.
only_in expreLevel_cor_corhigh.txt expreLevel_cor_corlow.txt > highspecific_cor.txt
only_in expreLevel_cor_high.txt expreLevel_cor_low.txt > highspecific.txt

# Attach the after-correction count to each before-correction row.
with_corrected highspecific.txt highspecific_cor.txt > high_only_BF.txt

----- output file: [Before_rank] [seq] [Before_correction_count] [After_correction_count] -----


# with_corrected BEFORE AFTER
#   BEFORE, AFTER: tables with rows of "<sequence> <count>".
#   Prints "<row number in BEFORE> <BEFORE row> <count in AFTER>", ordered by
#   the AFTER count (descending); the last column is empty when the sequence
#   is not in AFTER.
with_corrected() {
  # FILENAME == ARGV[1] (not NR == FNR) marks the AFTER file, so an empty
  # AFTER still lets every BEFORE row through instead of producing no output.
  awk 'FILENAME == ARGV[1] { after[$1] = $2; next }
       { print FNR " " $0 " " after[$1] }' "$2" "$1" |
    sort -k4nr
}

with_corrected lowspecific.txt lowspecific_cor.txt > low_only_BF.txt


---- All comparisons are made against the uncorrected files ----

---------- exist in Low file, not exist in High file -------------
# only_in KEEP EXCLUDE
#   KEEP, EXCLUDE: tables with rows of "<count> <sequence>".
#   Prints "<sequence> <count> " for every KEEP row whose sequence has no row
#   in EXCLUDE, highest count first.
only_in() {
  # FILENAME == ARGV[1] (not NR == FNR) marks the EXCLUDE file, so an empty
  # EXCLUDE lets every KEEP row through instead of producing no output.
  awk 'FILENAME == ARGV[1] { seen[$2] = 1; next }
       !($2 in seen)       { print $2 " " $1 " " }' "$2" "$1" |
    sort -k2nr
}

# rank_both BEFORE AFTER
#   BEFORE, AFTER: outputs of only_in ("<sequence> <count> " rows).
#   Prints "<rank by AFTER count> <row number in BEFORE> <BEFORE row>
#   <count in AFTER>", ordered by the AFTER count (descending).
rank_both() {
  awk 'FILENAME == ARGV[1] { after[$1] = $2; next }
       { print FNR " " $0 " " after[$1] }' "$2" "$1" |
    sort -k4nr |
    awk '{ print NR " " $0 }'
}

# Keep sequences absent from the uncorrected high table, sorted by expression
# level (after correction).
only_in expreLevel_cor_corlow.txt expreLevel_cor_high.txt > N_lowspecific_cor.txt

# Keep sequences absent from the uncorrected high table, sorted by expression
# level (before correction).
only_in expreLevel_cor_low.txt expreLevel_cor_high.txt > N_lowspecific.txt

rank_both N_lowspecific.txt N_lowspecific_cor.txt > N_low_only_FB.txt

----- output file: [After_correction_rank] [Before_correction_rank] [seq] [Before_correction_count] [After_correction_count] -----


---------- exist in High file, not exist in Low file -------------

# only_in KEEP EXCLUDE
#   KEEP, EXCLUDE: tables with rows of "<count> <sequence>".
#   Prints "<sequence> <count> " for every KEEP row whose sequence has no row
#   in EXCLUDE, highest count first.
only_in() {
  # FILENAME == ARGV[1] (not NR == FNR) marks the EXCLUDE file, so an empty
  # EXCLUDE lets every KEEP row through instead of producing no output.
  awk 'FILENAME == ARGV[1] { seen[$2] = 1; next }
       !($2 in seen)       { print $2 " " $1 " " }' "$2" "$1" |
    sort -k2nr
}

# rank_both BEFORE AFTER
#   BEFORE, AFTER: outputs of only_in ("<sequence> <count> " rows).
#   Prints "<rank by AFTER count> <row number in BEFORE> <BEFORE row>
#   <count in AFTER>", ordered by the AFTER count (descending).
rank_both() {
  awk 'FILENAME == ARGV[1] { after[$1] = $2; next }
       { print FNR " " $0 " " after[$1] }' "$2" "$1" |
    sort -k4nr |
    awk '{ print NR " " $0 }'
}

# High-only sequences relative to the uncorrected low table, after and
# before correction.
only_in expreLevel_cor_corhigh.txt expreLevel_cor_low.txt > N_highspecific_cor.txt
only_in expreLevel_cor_high.txt expreLevel_cor_low.txt > N_highspecific.txt

rank_both N_highspecific.txt N_highspecific_cor.txt > N_high_only_FB.txt