1. Prokka注释

conda activate prokka   # 注意在运行之前一定记得切换环境
cp ~/work/a_general_shell/multi_prokka.sh ./
# 此处注意输入文件需更改格式为.fa后缀
nohup ./multi_prokka.sh &

2. Roary泛基因组比较差异基因

运行前提是已经进入到prokka的gff文件中

conda activate aroary
nohup roary -e -n -v -p 32 ./*.gff -f roary_output &
Example-两两比对
nohup roary -e -n -v -p 32 /mnt/sdb1/share/ZHAOWENBO/fanjiyinzu0307/roary_2325_R1/*.gff -f 2325_R1 &          

nohup roary -e -n -v -p 32 /mnt/sdb1/share/ZHAOWENBO/fanjiyinzu0307/roary_R1_R2/*.gff -f R1_R2 &

nohup roary -e -n -v -p 32 /mnt/sdb1/share/ZHAOWENBO/fanjiyinzu0307/roary_R2_R3/*.gff -f R2_R3 &

nohup roary -e -n -v -p 32 /mnt/sdb1/share/ZHAOWENBO/fanjiyinzu0307/roary_R2_R4/*.gff -f R2_R4 &

nohup roary -e -n -v -p 32 /mnt/sdb1/share/ZHAOWENBO/fanjiyinzu0307/roary_R3_R4/*.gff -f R3_R4 &

nohup roary -e -n -v -p 32 /mnt/sdb1/share/ZHAOWENBO/fanjiyinzu0307/roary_2325_R1_R2_R4/*.gff -f 2325_R1_R2_R4 &

3. 三代拼接结果gff文件从各个文件取出来并命名

for i in ` ls |grep "_fly" ` ;do cp $i/assembly.fasta ./$i.fa ;done

4.二三代混合拼接

nohup unicycler -1 ~/work/zhang_likuan/MA120_zhulian_2+3th_20240305/MA120/2_HQData/MA120.SOAP_cor_R1.fq.gz -2 ~/work/zhang_likuan/MA120_zhulian_2+3th_20240305/MA120/2_HQData/MA120.SOAP_cor_R2.fq.gz -l ~/work/zhang_likuan/MA120_zhulian_2+3th_20240305/MA120/nanopore/MA120.fastq.gz -o unicycler20240306 &

# -1 为正向;-2 为反向; -l 输入三代测序结果

5. fasta后缀批量替换为fa

rename s/.fasta/.fa/g ./*.fasta

rename s/.fasta/.fa/g ./*.fasta

6. conda环境查看

conda env list
conda info --envs

conda list # 查看conda环境内置软件

conda list -n mafft  # 可查看mafft conda环境下的软件包有哪些

7. MLST分型(ST型扫描)

nohup mlst ./*.fa > mlst.txt & # 注意mlst储存在abricate环境中记得切换

8. 耐药/毒力/质粒类型基因扫描(abricate)

nohup ./general_abricate.sh & # 与fa文件放于同一目录下

9. SNP进化树分析

conda activate parsnp
nohup parsnp -c -p 16 -n mafft -r ./BJQBH010.fa -d ./*.fa -o parsnp &

10. MAFFT序列比对

conda activate mafft
nohup mafft sequence.fasta > output.fa   

mafft --auto sequence.fa > output.fa
# 此处可以直接使用.fasta整个多条目文件

# 以下粘贴自作者:lkj666
链接:https://www.jianshu.com/p/2b882849ed61
来源:简书
著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。



情况一:序列小于200条,长度<2000aa/nt(最准确)
mafft --maxiterate 1000 --localpair input > output

情况二:序列小于200条,长度<2000aa/nt,序列长度相似
mafft --maxiterate 1000 --globalpair input > output

情况三:不清楚序列情况
mafft --auto input > output

MUSCLE

  1. 简介

多序列比对在生物信息分析中是常用到的操作,目前多序列比对的软件较多,没有一款十分完美的多序列比对工具。有文献报道:

  1. 多序列比对的目的

输入文件的格式要求:
① 序列为fasta格式文件
② 序列的名称要唯一,不能有空格,且重要信息尽量放在前面
③ 序列名称不要出现中文、@、\等特殊字符

4.1 安装
conda install muscle

4.2 使用
muscle -in file1 -out file2

11. iqtree建树

iqtree -s output.fa -m MFP -T AUTO


iqtree -s output.fa -m MFP -T AUTO -cmax 15 -B 5000 -bnni --prefix samples


nohup iqtree2 -s 328_ref_essC_all_length_AA_mafft.fa -m MFP -T AUTO -B 1000 --prefix samples &

-B 100 也是此处的指令,想准确度更高可以添加
-m MFP 为自动选择比对方法
-T AUTO 为使用最大CPU核心数

nohup iqtree2 -s 204_bidui.fa -m MFP -T AUTO --prefix samples &

12. seqkit批量批量提取多条序列并成单独.fa文件

seqkit split -i sequence.fasta

13. BRIG 质粒比对图

14.Easyfig 环境比对图

15. Artemis 单质粒图

16. mlst.txt结果菌种分析

(abricate) dxd@henau-PowerEdge-R940xa:~/work/zhao_wenbo/DYKp_74_20220627/spades_2th$ 
cat mlst.txt | grep ".fa" | cut -f2 | sort | uniq -c
      1 -
      1 [15:43:21] Found 'any2fasta' => /mnt/sdb1/HAU/dxd/miniconda3/envs/abricate/bin/any2fasta
      1 [15:44:11] Found exact allele match efaecalis.aroE-14
      1 [15:44:11] Found exact allele match efaecalis.gdh-17
      1 [15:44:11] Found exact allele match efaecalis.gki-3
      1 [15:44:11] Found exact allele match efaecalis.gyd-2
      1 [15:44:11] Found exact allele match efaecalis.pstS-1
      1 [15:44:11] Found exact allele match efaecalis.xpt-14
      1 [15:44:11] Found exact allele match efaecalis.yqiL-1
      1 [15:44:11] Found exact allele match efaecium.adk-18
      1 [15:44:11] Found exact allele match efaecium.atpA-99
      1 [15:44:11] Found exact allele match efaecium.gdh-98
      1 [15:44:11] Found exact allele match efaecium.pstS-199
      1 abaumannii_2
     24 ecoli_achtman_4
     48 klebsiella

17. 核对两个文件内部的内容是否一致(用于比对下载的文件是否全面)

cat down.txt genome_list.txt |sort |uniq -c |sort |head 

18. sed -i 's/\r//g' genome_list.txt

这条命令使用sed工具来在genome_list.txt文件中执行替换操作。具体来说,命令的含义如下:

因此,这条命令的作用是在genome_list.txt文件中删除所有的回车符(\r),这通常用于处理Windows格式文本文件在Unix/Linux系统下可能出现的行尾格式问题。

如果您有任何其他问题或需要进一步的解释,请随时告诉我。我随时为您提供帮助。

sed -n l 1.txt 
cat sequences|grep tmexD2 |sed -n l
seqkit grep --pattern-file 1.txt sequences > tmexD2.fa
seqkit grep -h
seqkit grep -p ncbi~~~tmexD2~~~NG_076648.1~~~tmexD multidrug_efflux_RND_transporter_permease_subunit_TMexD2 > tmexD2.fa
seqkit grep -p ncbi~~~tmexD2~~~NG_076648.1~~~tmexD sequences > tmexD2.fa
history | tail

19.建库修改规则

awk -F "_" '/^>/{split($0, a, "_"); printf(">ISfinder~~~%s~~~unkown~~~%s\n", a[1], a[2]); next} {print}' file.fasta > new_file.fasta


awk -F "|" '/^>/{split($0, a, "|"); printf(">ncbi~~~%s~~~%s~~~%s %s\n", a[6], a[3], a[7], a[8]); next} {print}' sequences > sequences1

# ncbi

seqkit stats sequences

20. 提取特异性基因序列

#!/bin/bash
mkdir -p ncbi_2024_exc ; cat general_ab_anno/ncbi_2024/*.fa.txt |grep "tet(A)" |awk '$10<100 || $11<100 {print}' |cat > ncbi_2024_exc/res_sum.txt ;cat ncbi_2024_exc/res_sum.txt |grep -v "#" |awk '{print $1,$6,$2,$3-1,$4}' | sed 's#/#_#g' | cat > ncbi_2024_exc/id.txt ; split -l 1 ncbi_2024_exc/id.txt -d ncbi_2024_exc/roll
for i in ` ls ncbi_2024_exc/roll* |sed 's#ncbi_2024_exc/##g' `
do
        str=$(cat ncbi_2024_exc/$i |awk '{print $1}' |sed 's/.fa//g')
        res=$(cat ncbi_2024_exc/$i |awk '{print $2}')
        mkdir -p ncbi_2024_exc/$str ;cat ncbi_2024_exc/$i |awk -v OFS="\t" '{print $3,$4,$5}' |cat >ncbi_2024_exc/${str}_${res}_id.txt ;cp ncbi_2024_exc/${str}_* ncbi_2024_exc/${str}/ ; seqkit subseq --bed ncbi_2024_exc/${str}/${str}_${res}_id.txt $str.fa -o ncbi_2024_exc/$str/$res.fa
done

# 此处为提取tetA

(base) dxd@henau-PowerEdge-R940xa:~/work/yu_runhao/henan_enter_20240319/sub_seq/tetX4_tiqu$ seqkit stats ./*.fa
file format type num_seqs sum_len min_len avg_len max_len
./1916D18.fa FASTA DNA 3 4,966,215 59,353 1,655,405 4,828,364
./1916D6.fa FASTA DNA 3 4,866,996 59,351 1,622,332 4,734,139
./1919D3.fa FASTA DNA 3 5,024,825 95,543 1,674,941.7 4,710,181
./1919D62.fa FASTA DNA 3 4,969,394 114,129 1,656,464.7 4,669,207
./HNCF11W.fa FASTA DNA 3 4,772,542 57,105 1,590,847.3 4,584,794
./HN-P16.fa FASTA DNA 122 4,856,932 237 39,810.9 405,887
./HN-P19.fa FASTA DNA 123 5,062,270 211 41,156.7 617,104
./HN-PA19.fa FASTA DNA 126 5,062,176 211 40,176 617,104
./PKP2180.fa FASTA DNA 257 4,752,044 100 18,490.4 426,081
./PKP2181.fa FASTA DNA 289 4,797,585 100 16,600.6 305,758
./PKP2182.fa FASTA DNA 250 4,863,924 100 19,455.7 335,109
./PKP2183.fa FASTA DNA 190 4,951,936 100 26,062.8 464,651
./T16R.fa FASTA DNA 4 5,035,953 33,309 1,258,988.3 4,639,361
./T28R.fa FASTA DNA 4 5,025,884 65,023 1,256,471 4,606,706

21. 查找当前目录及子目录下的sh后缀文件

find . -type f -name "*.sh"

22.解决no such file问题

dos2unix temp_column.txt

23. 提取所需contig

awk -v RS=">" 'NR>1 {print RS $0}' *.fa > temp_output.txt
awk '{print $2}' 30.txt > temp_column.txt
bash danduxulie.sh
dos2unix temp_column.txt
for i in `cat temp_column.txt`;do cp ./roll/$i.fasta ./30_tiqu_contig/;done

24. 采用find命令根据文件大小进行筛选操作

# 筛选大于150 kB 小于250kB的文件进行复制
find . -type f -size + 150k -size - 250k -exec cp {} 200KB/ \;

#筛选小于100 kB的文件进行复制
find . -type f -size -100k -exec cp {} 100KB/ \;

25. 采用seqkit grep命令根据CDS名称提取氨基酸序列

while read line ; do
	
	num=$(echo $line | awk '{print $1}')
	CDS=$(echo $line | awk '{print $2}')
	
	seqkit grep -n -r -p  $CDS  ./$num/*.faa > ./LXG_seq/${num}_${CDS}.fa

done < ../../LXG_L_SC_ipout.tsv

26. Spades二代拼接

# 一个是--carfule
7794  for i in `ls` ;do spades.py -m 32 -t 8 -k 21,33,55,77,99 --careful --isolate --pe1-1 /mnt/sdb1/share/XINGHONGJIE/2_jinpu_2+3th_20240529/${i}/2_HQData/${i}_HQ_R1.fq.gz --pe1-2 /mnt/sdb1/share/XINGHONGJIE/2_jinpu_2+3th_20240529/${i}/2_HQData/${i}_HQ_R2.fq.gz -o /mnt/sdb1/HAU/dxd/work/xing_hongjie/2_jinpu_2+3th_20240529/spades_2th_1/${i}_SPAdes --cov-cutoff auto ; cp /mnt/sdb1/HAU/dxd/work/xing_hongjie/2_jinpu_2+3th_20240529/spades_2th_1/${i}_SPAdes/scaffolds.fasta /mnt/sdb1/HAU/dxd/work/xing_hongjie/2_jinpu_2+3th_20240529/spades_2th_1/${i}.fa; done

# 一个是--isolate
 7795  for i in `ls` ;do spades.py -m 32 -t 8 -k 21,33,55,77,99 --isolate --pe1-1 /mnt/sdb1/share/XINGHONGJIE/2_jinpu_2+3th_20240529/${i}/2_HQData/${i}_HQ_R1.fq.gz --pe1-2 /mnt/sdb1/share/XINGHONGJIE/2_jinpu_2+3th_20240529/${i}/2_HQData/${i}_HQ_R2.fq.gz -o /mnt/sdb1/HAU/dxd/work/xing_hongjie/2_jinpu_2+3th_20240529/spades_2th_1/${i}_SPAdes --cov-cutoff auto ; cp /mnt/sdb1/HAU/dxd/work/xing_hongjie/2_jinpu_2+3th_20240529/spades_2th_1/${i}_SPAdes/scaffolds.fasta /mnt/sdb1/HAU/dxd/work/xing_hongjie/2_jinpu_2+3th_20240529/spades_2th_1/${i}.fa; done

28 针对三代数据修改>后的行内容为文件名

for file in ./*; do sed -i "s/>.*/>${file##*/}/g" "$file"; done

29 LXG序列提取 (即根据CDS名从prokka faa文件中提取序列)

while read line ; do
	
	num=$(echo $line | awk '{print $1}')
	CDS=$(echo $line | awk '{print $2}')
	
	seqkit grep -n -r -p  $CDS  ./prokka/$num/*.faa > ./LXG_seq/${num}_${CDS}.fa

done < ./LXG_huazhong_ipout.tsv

30 将文件名作为第一列内容输入进文件内容中

for file in *ipout; do     awk -v filename="$file" '{print filename "\t" $0}' "$file" > "./xiugai/$file"; done

31 为李昊做的CmeB的提取,因为要翻译成AA序列所以涉及到反转的问题

mkdir ncbi_exc_NN

cat ./ab_CmeB/CmeB/*.fa.txt > ncbi_exc_NN/res_sum.txt

cat ./ncbi_exc_NN/res_sum.txt | grep -v "#" | awk '{print $1,$6,$2,$3-1,$4,$5}' | sed 's#/#_#g' | cat > ncbi_exc_NN/id.txt

split -l 1 ncbi_exc_NN/id.txt -d ncbi_exc_NN/roll

for i in ` ls ncbi_exc_NN/roll* |sed 's#ncbi_exc_NN/##g' `
do
	str=$(cat ncbi_exc_NN/$i |awk '{print $1}' |sed 's/.fa//g')
	res=$(cat ncbi_exc_NN/$i  |awk '{print $2}')
	start=$(cat ncbi_exc_NN/$i | awk '{print $4}')
	strand=$(cat ncbi_exc_NN/$i | awk '{print $6}')
	mkdir -p ncbi_exc_NN/$str
	cat ncbi_exc_NN/$i |awk -v OFS="\t" '{print $3,$4,$5}' |cat >ncbi_exc_NN/${str}_${res}_${start}_id.txt
	cp ncbi_exc_NN/${str}_* ncbi_exc_NN/${str}/ 
	seqkit subseq --bed ncbi_exc_NN/${str}/${str}_${res}_${start}_id.txt ./206_fa/$str.fa -o ncbi_exc_NN/$str/${res}_${str}_${start}_${strand}.fa
done

rm ./ncbi_exc_NN/roll*
rm ./ncbi_exc_NN/*_id.txt

mkdir ncbi_exc_NN_all
find ./ncbi_exc_NN/ -name "*.fa" -exec cp {} ./ncbi_exc_NN_all/ \;