在Python中使用标志时奇怪的输出
我目前正在用Python编写一个脚本,它需要一些标志。 这是我第一次尝试这样的程序,并且我从bash脚本中得到了一个我不太明白的输出。 例如,当我在bash shell中运行脚本时:
$ python my_script.py -f <input_file.txt> -k <test_string> -c <user_input>
我在脚本输出之前得到这个输出:
usage: rm [-f | -i] [-dPRrvW] file ...
unlink file
我似乎无法摆脱这一点,这对于产出的美观感到沮丧。 任何帮助将是伟大的!
我正在使用的代码:
import sys, getopt, re, subprocess, collections, itertools
def find_kmers( arguments=sys.argv[1:] ):
required_opts = ['-f','-c','-k']
opts, args = getopt.getopt(arguments,'f:k:c:')
opt_dic = dict(opts)
for opt in required_opts:
if opt not in opt_dic:
return "incorrect arguments, please format as: python_script.py -f <filename> -k <kmer> -c <chromosome_name>"
def rev_comp(sequence):
reversed_dic = {'A':'T','T':'A','C':'G','G':'C'}
return ''.join(reversed_dic[_] for _ in sequence[::-1])
kmer = opt_dic['-k']
subprocess.call(['bash','-c',"grep '>' S288C_R64.fasta > grep.tmp"])
chromosomes = [_[1:].strip() for _ in open('grep.tmp')]
subprocess.call(['bash','-c','rm','grep.tmp'])
found = False
if any(opt_dic['-c']==_ for _ in chromosomes):
found = True
def get_sequence(file):
sequence = ''
for line in file:
if line.startswith('>'): break
sequence += line.strip()
return sequence.upper()
ofile = open(opt_dic['-f'])
if found == True:
for line in ofile:
if line.startswith('>'):
if line[1:].strip() == opt_dic['-c']:
sequence = get_sequence(ofile)
break
else:
return 'chromosome not found in %s. n chromosomes in file are:%s'%(opt_dic['-f'],', '.join(str(_) for _ in chromosomes))
kmer_matches1 = re.finditer('(?=%s)'%opt_dic['-k'],sequence)
kmer_matches2 = re.finditer('(?=%s)'%opt_dic['-k'],rev_comp(sequence))
def print_statement(start,strand):
return '%sthw1_scripttkmer=%st%st%st.t%st.tID=S288C;Name=S288Cn'%(opt_dic['-c'],opt_dic['-k'],start,start+len(opt_dic['-k'])-1,strand)
pos_strand = collections.deque()
neg_strand = collections.deque()
for match1,match2 in itertools.izip(kmer_matches1,kmer_matches2):
pos_strand.append(match1.start()+1)
neg_strand.append(match2.start()+1)
wfile = open('answer.gff3','w')
while len(pos_strand)>0 and len(neg_strand)>0:
if pos_strand[0]<neg_strand[0]:
start = pos_strand.popleft()
wfile.write(print_statement(start,'+'))
else:
start = neg_strand.popleft()
wfile.write(print_statement(start,'-'))
while len(pos_strand)>0:
start = pos_strand.popleft()
wfile.write(print_statement(start,'+'))
while len(neg_strand)>0:
start = neg_strand.popleft()
wfile.write(print_statement(start,'-'))
wfile.close()
return 'percent-GC = %s'%str(sum(sequence.count(gc) for gc in ["G","C"])/float(len(sequence)))
if __name__ == '__main__':
print find_kmers()
调用bash
单行程序要求bash命令是单个字符串。 更改:
subprocess.call(['bash','-c','rm','grep.tmp'])
至:
subprocess.call(['bash', '-c', 'rm grep.tmp'])
或者更合理的是,不要为此使用子流程,只需要:
os.unlink('grep.tmp') # Or os.remove; same thing, different names
这是更快,更容易出错。
事实上,你所有的子进程使用都可以用真正的Python代码来替代,并且可以大幅改进它(大部分Python代码也可以简化):
def find_kmers( arguments=sys.argv[1:] ):
required_opts = ['-f','-c','-k']
opts, args = getopt.getopt(arguments,'f:k:c:')
opt_dic = dict(opts)
for opt in required_opts:
if opt not in opt_dic:
return "incorrect arguments, please format as: python_script.py -f <filename> -k <kmer> -c <chromosome_name>"
def rev_comp(sequence):
reversed_dic = {'A':'T','T':'A','C':'G','G':'C'}
return ''.join(reversed_dic[_] for _ in sequence[::-1])
kmer = opt_dic['-k']
# Replaces grep with temp file with trivial Python equivalent
with open('S288C_R64.fasta') as f:
chromosomes = [line[1:].strip() for line in f if '>' in line]
# No need for any loop when just checking for exact value
if opt_dic['-c'] not in chromosomes:
return 'chromosome not found in %s. n chromosomes in file are:%s'%(opt_dic['-f'],', '.join(str(_) for _ in chromosomes))
def get_sequence(file):
sequence = ''
for line in file:
if line.startswith('>'): break
sequence += line.strip()
return sequence.upper()
with open(opt_dic['-f']) as ofile:
for line in ofile:
if line.startswith('>'):
if line[1:].strip() == opt_dic['-c']:
sequence = get_sequence(ofile)
break
kmer_matches1 = re.finditer('(?=%s)'%opt_dic['-k'],sequence)
kmer_matches2 = re.finditer('(?=%s)'%opt_dic['-k'],rev_comp(sequence))
def print_statement(start,strand):
return '%sthw1_scripttkmer=%st%st%st.t%st.tID=S288C;Name=S288Cn'%(opt_dic['-c'],opt_dic['-k'],start,start+len(opt_dic['-k'])-1,strand)
pos_strand = collections.deque()
neg_strand = collections.deque()
for match1,match2 in itertools.izip(kmer_matches1,kmer_matches2):
pos_strand.append(match1.start()+1)
neg_strand.append(match2.start()+1)
with open('answer.gff3','w') as wfile:
while pos_strand and neg_strand:
if pos_strand[0]<neg_strand[0]:
start = pos_strand.popleft()
wfile.write(print_statement(start,'+'))
else:
start = neg_strand.popleft()
wfile.write(print_statement(start,'-'))
for start in pos_strand:
wfile.write(print_statement(start,'+'))
for start in neg_strand:
wfile.write(print_statement(start,'-'))
return 'percent-GC = %s'%str(sum(sequence.count(gc) for gc in ["G","C"])/float(len(sequence)))
链接地址: http://www.djcxy.com/p/42429.html