from math import *
import re
import sys

# Path of the PDB file to check, taken from the first command-line argument.
if len(sys.argv) < 2:
    # Exit with a readable hint instead of a bare IndexError traceback.
    sys.exit('Usage: supply the path of the PDB file to check as the first argument')
fname = sys.argv[1]

# Expected atom signature of each standard amino acid: the first letter of
# every non-hydrogen atom name, in the order the atoms are listed in the file.
# The scan loop builds the same kind of string per residue and compares it
# against these values to spot residues with missing atoms.
AAdict = dict(
    ALA='NCCOC',
    ARG='NCCOCCCNCNN',
    ASN='NCCOCCON',
    ASP='NCCOCCOO',
    CYS='NCCOCS',
    GLN='NCCOCCCON',
    GLU='NCCOCCCOO',
    GLY='NCCO',
    HIS='NCCOCCNCCN',
    ILE='NCCOCCCC',
    LEU='NCCOCCCC',
    LYS='NCCOCCCCN',
    MET='NCCOCCSC',
    PHE='NCCOCCCCCCC',
    PRO='NCCOCCC',
    SER='NCCOCO',
    THR='NCCOCOC',
    TRP='NCCOCCCCNCCCCC',
    TYR='NCCOCCCCCCCO',
    VAL='NCCOCCC',
)


# Input structure (read) and report file (written); both are closed at the
# very end of the script.
f = open(fname, 'r')
h = open('Errors.txt', 'w')

# Set to 1 as soon as any problem is reported; decides whether the
# "no errors" line is written at the end.
error_info = 0

# Residue number + insertion code of the last inserted residue reported, so
# each insertion is flagged only once.
checker = 0

# Running tallies: water records, zinc records, amino acids.
h20_count = zn_count = aa_counter = 0
# Residue field of the previous ATOM record, used to detect a new amino acid.
aa_changes = 0

# Coordinates of the previous alpha carbon, for the distance check.
x1 = y1 = z1 = 0

# Name of the last atom stored for the current residue; lets the scan
# recognise a chain end that carries an OXT atom.
endcheck = 0
# Full record of the previous alpha carbon, echoed when a break is reported.
lineoldCA = 0

# Residue identifiers of the previous and the current ATOM record.
last = ''
current = ''
# Atom-name initials gathered for the residue currently being read.
AAstr = ''

# Matches any capital letter (used to spot an insertion code).
letter = re.compile('[A-Z]')

# Scan the PDB file one record at a time.  For every record this pass
#   * counts water (HOH) and zinc (ZN) HETATM records,
#   * counts amino acids by watching the residue field of ATOM records,
#   * reports residues that carry an insertion code,
#   * reports consecutive alpha carbons of one chain that are not
#     3.8 +/- 0.2 angstroms apart,
#   * reports standard residues whose heavy atoms do not match AAdict.
# Problems are written to the report file `h` and set error_info to 1.

chainold = None     # chain ID of the previous alpha carbon; None until one is read
AAterm = ''         # residue name + chain field of the residue being accumulated


def _missing_atoms(res_term, atoms):
    """Return True when `atoms` is not the expected atom string of the residue.

    res_term -- residue name followed by the chain field (record[17:22]);
                only its first three characters select the AAdict template.
    atoms    -- initials of the heavy atoms collected for that residue.
    """
    return atoms != AAdict.get(res_term[0:3])


for line in f:
    if line[0:6] == "HETATM":
        if line[17:20] == 'HOH':
            h20_count += 1
        if line[18:20] == 'ZN':
            zn_count += 1

    if not line.startswith("ATOM"):
        continue

    # A change in the residue field marks the start of another amino acid.
    aa_checker_count = line[22:28]
    if aa_changes != aa_checker_count:
        aa_counter += 1
        aa_changes = aa_checker_count

    # Insertion code: a capital letter straight after the residue number.
    # Sliced (not indexed) so a truncated record cannot raise IndexError.
    if letter.search(line[26:27]):
        insert_repeat = line[23:27]
        if insert_repeat != checker:        # report each insertion only once
            h.write("\nInserted residue:  ")
            h.write(line[17:27] + '\n')
            error_info = 1
            checker = insert_repeat

    # Alpha carbons: the distance to the previous one in the same chain
    # should be 3.8 +/- 0.2 angstroms.
    if line[13:16] == 'CA ':
        x2 = float(line[30:38])
        y2 = float(line[38:46])
        z2 = float(line[46:54])
        chainnew = line[21]
        distance_calc = sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2 + (z2 - z1) ** 2)

        # Compare only when a previous alpha carbon of the same chain exists.
        # The atom-serial test is kept from the original script, where it
        # stopped the first alpha carbon from being flagged.
        if chainold == chainnew and line[6:11] != "    2":
            if distance_calc > 4.0:
                h.write("\nDisorder between consecutive residues!:\n")
                h.write(lineoldCA + line)
                # A residue number that goes backwards hints at a new chain
                # that was not given its own identifier.
                if line[22:26] < lineoldCA[22:26]:
                    h.write('This may have occurred due to the abscence of a chain identifier in the pdb file. Please check this above.\n')
                error_info = 1
            elif distance_calc < 3.6 and line[17:20] != 'PRO':
                # proline is allowed to be closer than 3.6 angstroms
                h.write("\nDisorder between consecutive residues!:\n")
                h.write(lineoldCA + line)
                error_info = 1

        # Always remember this alpha carbon, whatever the outcome above, so
        # the next comparison and report use the true predecessor.
        x1, y1, z1 = x2, y2, z2
        chainold = chainnew
        lineoldCA = line

    # Heavy-atom completeness of standard residues.
    current = line[23:27]
    if line[17:20] in AAdict:
        res_term = line[17:22]
        if last != '' and (last != current or AAterm != res_term):
            # First atom of a new residue: validate the finished one against
            # the template of its own residue type.
            if _missing_atoms(AAterm, AAstr):
                h.write('\nMissing atoms in:  ' + AAterm + ' ' + last + '\n')
                error_info = 1
            AAstr = ''
        AAterm = res_term

        # Hydrogens are not part of the templates.  Their names begin either
        # in the same column as heavy atoms or one column earlier (four
        # character names such as HD11), so both columns are tested.
        is_hydrogen = 'H' in line[12:14]
        # OXT is the extra oxygen at a chain end; it is left out so terminal
        # residues can still be compared with their template.
        if not is_hydrogen and line[13:16] != 'OXT':
            AAstr = AAstr + line[13:14]
        last = current

# The loop only validates a residue when the next one starts, so the final
# residue of the file is checked here.
if last != '' and _missing_atoms(AAterm, AAstr):
    h.write('\nMissing atoms in:  ' + AAterm + ' ' + last + '\n')
    error_info = 1
# Closing section of the report: an all-clear line when nothing was flagged,
# followed by the tallies gathered during the scan.
if not error_info:
    h.write("No errors were detected, go ahead and convert!")

h.writelines(('\nThere are ', str(h20_count), ' water molecules in the pdb file\n'))

# Zinc is only mentioned when at least one was found.
if zn_count > 0:
    h.writelines(('\nThere are ', str(zn_count), ' zinc molecules in the pdb file\n'))

h.writelines(('\nThere are ', str(aa_counter), ' amino acids in the pdb file\n'))

# Release the report first, then the input file.
h.close()
f.close()


