diff --git a/check_add_vr.py b/check_add_vr.py
new file mode 100644
index 0000000..b6b9496
--- /dev/null
+++ b/check_add_vr.py
@@ -0,0 +1,188 @@
+'''
+    This code checks each entry in the dicom_ontology.owl file for an
+    explicit VR assignment. The issue is that the "new" entries -
+    the entries/tags not in the original set that were scraped from
+    the NEMA website and instantiated in Neurolex - do not have
+    VR values attached. The reason for this is that in the DICOM
+    standard documentation a listing of all of the tags is given in Parts
+    06 and 07, but these lists do not include the definitions. The
+    definitions are given in tables in other Parts, so they have to
+    be extracted from those tables.
+
+    This code checks to see if an entry specifies the VR value and,
+    if not, retrieves it from the file
+    /home/karl/Work/INCF/XML_code/dicom_dict_vr.py
+    which was created by the code:
+    /home/karl/Work/INCF/XML_code/vr_generate_dict.py
+
+    This code skips lines until it finds
+    a line containing "Datatype Properties" and continues to read lines
+    until it finds a line that contains "dicom#dicom". It collects
+    lines in the variable "entry" until it encounters a line containing
+    " ." which is the ending marker for an entry. If it finds
+    a line containing "dicom:VR" then it writes the entry after removing
+    an extra blank line that was written into the original owl file as
+    the line previous to the line containing the VR info. The code then
+    writes that entry to the output file and then reads on, collecting the
+    next entry. If the entry does not contain a VR line, it queries the
+    dicom_dict_vr.py file to find the VR value and writes a line
+    containing the VR value in the correct format.
+
+    Note that since the code starts collecting entries with the "Datatype
+    Properties" section, everything above that needs to be pasted into
+    the resulting file to make a complete owl file. Also, the Class section
+    will be unchanged since there are no VR values for these terms
+    (since they are not official DICOM tags, but terms I extracted from the
+    official Part documents). The code checks for the presence of "xxxx" in
+    the dicom tag. If that string is in the tag then the entry is written
+    unchanged into the output file.
+
+
+Sample Complete Entry
+------------------------------------------------
+### http://purl.org/nidash/dicom#dicom_00280011
+
+dicom:dicom_00280011 rdf:type owl:DatatypeProperty ;
+
+        rdfs:label "Columns"^^xsd:string ;
+
+        obo:IAO_0000114 obo:IAO_0000428 ;
+
+        obo:IAO_0000115 "Number of columns in the image."^^xsd:string ;
+
+        dicom:dicom_xxxx0065 "(0028,0011)"^^xsd:string ;
+
+        dicom:VR "US"^^xsd:string ;
+
+        rdfs:subClassOf dc:identifier .
+
+
+2018-09-04 - started
+2018-09-14 - ran on full owl file and checked result into GitHub repo
+
+Karl Helmer
+Athinoula A. Martinos Center for Biomedical Imaging
+Massachusetts General Hospital, 2018
+
+'''
+
+import os, sys
+import re
+import ast
+
+#************************************************
+#input parameters
+inDir = '/home/karl/Work/INCF/dicom-ontology/'
+inFilename = 'dicom_ontology.owl'
+outDir = '/home/karl/Work/INCF/dicom-ontology/'
+outFilename = 'dicom_ontology_new.owl'
+vrDir = '/home/karl/Work/INCF/XML_code/'
+vrFilename = 'dicom_dict_vr.dict'
+startEntry = 'dicom#dicom'
+endEntry = ' .'
+startPlace = 'Datatype Properties'
+#************************************************
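+
+# A minimal sketch (illustration only; this hypothetical helper is not called
+# anywhere - get_vr() below does the real work) of the lookup this script performs.
+# The dict file written by vr_generate_dict.py is assumed to hold a single Python
+# dict literal mapping "GGGGEEEE" tag strings to
+# ('VR', 'VM', "Name", 'is_retired', 'Keyword') tuples.
+def example_vr_lookup(dictPath, tag):
+    dicomDict = ast.literal_eval(open(dictPath, 'r').read())
+    return dicomDict.get(tag, ('',))[0]   # e.g. '00280011' -> 'US' (Columns)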
+
+def search_vr(entry):
+    #make 1 string rather than searching in each string individually
+    #this is faster than using some version of "any"
+    combined = ' '.join(entry)
+    test = 'dicom:VR' in combined
+
+    return test
+
+
+def get_tag(entry):
+
+    # check each line in entry for the dicom tag
+    for e in entry:
+        if 'rdf:type' in e:
+            t = re.search('dicom_(.+?) ', e)
+            tag = t.group(1)
+            if tag:
+                #print "The DICOM tag is: ", tag
+                return tag
+            else:
+                #this will crash the program since no tag value is returned
+                print "no dicom tag value found in: ", entry
+
+
+def get_vr(vrDir, vrFilename, tag):
+    if tag:
+        vrF = open(vrDir+vrFilename, 'r').read()
+        dicomDict = ast.literal_eval(vrF)
+        if tag in dicomDict:
+            vr = dicomDict.get(tag,'')[0]
+            #print "vr value is = ", vr
+            return vr
+        else:
+            print "vr value not found for tag = ", tag
+            return None
+
+
+def add_vr_to_entry(vr, entry):
+    vrLine = ' dicom:VR "{}"^^xsd:string ;\n'.format(vr)
+    entry.insert(4,vrLine)
+    entry.insert(5,'\n')
+
+    return entry
+
+
+def write_entry(entry, outFile):
+    for line in entry:
+        outFile.write(line)
+    outFile.write('\n\n')
+
+
+def remove_sequential_blanks_in_entry(entry):
+    for i in range(len(entry)-1):
+        if entry[i] == entry[i+1]:
+            del entry[i]
+            break
+
+    return entry
+
+
+
+def main():
+
+    # open the dicom ontology file and start reading
+    with open(inDir+inFilename, 'r') as inFile, open(outDir+outFilename, 'w') as outFile:
+        entry = []
+        copy = False
+        dt = False
+        for line in inFile:
+            if startPlace in line:   #find "Datatype Properties" line and start here
+                dt = True
+                print "starting place is:", startPlace
+
+            if dt == True:   #start check after Datatype Prop line
+                if startEntry in line:
+                    copy = True
+                    #print 'start of entry'
+                if endEntry in line:
+                    copy = False
+                    entry.append(line)   #append the last line of entry
+                    #print 'end of entry'
+
+                    #now check the entry list as a whole
+                    vrFlag = search_vr(entry)   #see if the entry has a VR line
+                    tag = get_tag(entry)   #extract the tag from entry
+                    print tag, vrFlag
+                    if vrFlag == False and ("xxxx" not in tag):
+                        vr = get_vr(vrDir, vrFilename, tag)   #get vr value from the dict
+                        entry1 = add_vr_to_entry(vr, entry)
+                        write_entry(entry1, outFile)   #write entry with added vr to outfile
+                        entry = []   #clear entry list when finished
+                    else:
+                        entry2 = remove_sequential_blanks_in_entry(entry)
+                        write_entry(entry2, outFile)   #write unchanged entry to outfile
+                        entry = []
+                elif copy:
+                    entry.append(line)
+
+
+
+##############################################################
+if __name__ == "__main__":
+    main()
diff --git a/create_dicom_ttl.0.4.py b/create_dicom_ttl.0.4.py
new file mode 100644
index 0000000..ea020ba
--- /dev/null
+++ b/create_dicom_ttl.0.4.py
@@ -0,0 +1,401 @@
+'''
+    This code takes the DICOM terms from the DICOM XML docbook
+    (provided by David Clunie) and the CSV file of the DICOM terms
+    from Neurolex and creates a basic turtle file. Note that in the
+    definitions there are XML links that don't show up in the extracted
+    text file from the DICOM docbook. There are then phrases like
+    "See " that need to be removed from the definitions at the very end.
+
+ver 0.1  2017-03-14 - original; terms are camelcase labels
+ver 0.2  2017-03-28 - retrieve Neurolex ID's using term labels
+ver 0.3  2017-03-29 - retrieve neurolex ID using DICOM tags
+ver 0.4  2017-04-19 - change ID system to non-tag-based ID's
+                      reserve first 500 for other terms, rest for tags
+
+Karl Helmer
+Athinoula A. Martinos Center for Biomedical Imaging
+Massachusetts General Hospital
+
+'''
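+
+# For orientation, the two inputs are assumed to look roughly as follows (illustration
+# only, reconstructed from the regular expressions used in main() below).
+# Clunie text file - one tab-separated line per term:
+#   Name="Columns"  Tag="(0028,0011)"  Description="Number of columns in the image."
+# Neurolex CSV - one line per term containing, among other fields:
+#   ...:Category:Columns,...,DICOM:0028_0011,...,nlx_12345,...,"US"
+# (nlx_12345 is a made-up placeholder Neurolex ID.)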
+
+import os, sys
+import re
+from operator import itemgetter
+import pickle
+
+#************************************************
+#input parameters
+#outDir = '/home/karl/Work/INCF/nidm/nidm/nidm/nidm-experiment/imports/'
+outDir = '/home/karl/Work/INCF/dicom-ontology/'
+outFile = 'dicom_numericalID.ttl'
+tagDefFile = 'all_tag_definition.txt'
+# The following file version is the one that replaces the greek mu with "u"
+# Use of mu means dealing with unicode processing
+inFile = '/home/karl/Work/INCF/dicom-ontology/Clunie_DICOM_definitions-us.txt'
+nlxFile = '/home/karl/Work/INCF/dicom-ontology/Neurolex_dicom_terms_result.csv'
+dicomNS = 'dicom:'
+dicomPrefix = 'dicom_'
+rdfType = 'rdf:type'
+owlClass = 'owl:Class'
+owlDatatypeProperty = 'owl:DatatypeProperty'
+owlSameAs = 'owl:sameAs'
+rdfsLabel = 'rdfs:label'
+rdfsSub = 'rdfs:subClassOf'
+dicomTag = dicomNS+'dicom_00000065'  #'Tag'
+vrInDicom = dicomNS+'VR'
+nlxID = 'nidm:neurolexID'
+dcID = 'dc:identifier'
+labelStr = 'label'
+subClass = 'subClassOf'
+provNS = 'prov:'
+xsdString = '^^xsd:string '
+definitionStr = 'obo:IAO_0000115'
+editorNote = 'obo:IAO_0000116 "To be discussed."'
+curationStatusReady = 'obo:IAO_0000114 obo:IAO_0000122 '
+curationStatusReqDisc = 'obo:IAO_0000114 obo:IAO_0000428 '
+classLink = 'http://purl.org/nidash/dicom#'
+nlxLink = 'http://uri.neuinfo.org/nif/nifstd/'
+idStart = 500
+#************************************************
+
+def write_ontology_header(ttlFile):
+
+    # W3C/DCMI/OBO prefixes use the standard IRIs; the dicom and nlx IRIs follow
+    # classLink/nlxLink above, and the nidm and dc IRIs are assumed
+    ttlFile.write("@prefix : <http://purl.org/nidash/dicom#> .\n")
+    ttlFile.write("@prefix owl: <http://www.w3.org/2002/07/owl#> .\n")
+    ttlFile.write("@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n")
+    ttlFile.write("@prefix xml: <http://www.w3.org/XML/1998/namespace> .\n")
+    ttlFile.write("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n")
+    ttlFile.write("@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n")
+    ttlFile.write("@prefix nidm: <http://purl.org/nidash/nidm#> .\n")
+    ttlFile.write("@prefix dc: <http://purl.org/dc/elements/1.1/> .\n")
+    ttlFile.write("@prefix obo: <http://purl.obolibrary.org/obo/> .\n")
+    ttlFile.write("@prefix nlx: <http://uri.neuinfo.org/nif/nifstd/> .\n")
+    ttlFile.write("@base <http://purl.org/nidash/dicom> .\n")
+    ttlFile.write("\n")
+    ttlFile.write("<"+classLink+"> rdf:type owl:Ontology .\n")
+
+
+def write_class_header(ttlFile):
+    ttlFile.write('\n')
+    ttlFile.write('#################################################################\n')
+    ttlFile.write('#\n')
+    ttlFile.write('#    Datatype Properties\n')
+    ttlFile.write('#\n')
+    ttlFile.write('#################################################################\n')
+    ttlFile.write('\n')
+
+
+# The following two functions are used to create camelCase version of DICOM tag label
+#def repl_func(m):
+#    """process regular expression match groups for word upper-casing problem"""
+#    return m.group(1) + m.group(2).upper()
+
+
+#def create_camelcase_label(s):
+#    '''Capitalizes each word, removes non-alphanumeric characters
+#       and spaces from the label '''
+#    s = re.sub("(^|\s)(\S)", repl_func, s)
+#    s = re.sub('[^a-zA-Z0-9]+',"", s)
+#    s.replace(" ", "")
+#    if s[0].isalpha:
+#        s = s[0].lower() + s[1:]
+#
+#    return s
+
+
+# The following two functions are used to match term labels from the two input files
+# Used in string_match function
+def max_list_value(list,i):
+    # this function returns a tuple of the (index, maxValue) for a list
+    # you supply the list and the index of the place within the list that you
+    # want the max of.
+    return max(enumerate(sub[i] for sub in list), key=itemgetter(1))
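+
+# Example (illustration): max_list_value([['a', 2, 1], ['b', 0, 5]], -1) returns (1, 5),
+# i.e. the second sub-list holds the largest value in the last position; string_match()
+# below calls it with i=-1 to pick the candidate with the highest word-match count.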
+
+
+def string_match(label,nlxData):
+    # this code takes an input string (dicom tag label) and tries to find an
+    # exact match in another list of labels.  If no exact match is found, finds
+    # the closest match from a list of labels in which there is at least one
+    # match between the original label and the possible label.
+
+    # find the length of the list of possible labels
+    neuroLines = len(nlxData)
+    exactMatch = 'False'
+    noMatch = 'True'
+
+    print "considering DICOM file label = "+label
+
+    for i in range(neuroLines):
+        partMatch = 'False'
+        tempStore = []
+        # check for exact match
+        #print "DICOM_label=",label, " nlxData_label=",nlxData[i][0]
+        if nlxData[i][0] == label:
+            vrCode = nlxData[i][3]
+            dicomTagID = nlxData[i][2]
+            neurolexID = nlxData[i][1]
+            print "match for ", label
+            exactMatch = 'True'
+            noMatch = 'False'
+            break
+        # if no exact match, find how many words in the orig label are in the possible label
+        # if none, go to next nlxLabel in nlxData
+        else:
+            matchCount = 0
+            labelPart = label.split()
+            filteredLabelPart = [s for s in labelPart if len(s) > 2]   #don't match 2-or-less length words
+            for lp in filteredLabelPart:
+                #print lp
+                if lp in nlxData[i][0]:
+                    matchCount = 1+matchCount
+
+            # if at least one matching word, store needed info as list in list tempStore
+            if matchCount != 0:
+                partMatch = 'True'
+                tempStore.append([nlxData[i][0], nlxData[i][2], nlxData[i][1], nlxData[i][3], matchCount])
+
+        if (partMatch == 'True') and (exactMatch == 'False'):
+            print "Dicom label = "+label+"\n"
+            print "Neurolex entry = ", tempStore
+
+            if len(tempStore) > 1:   #if only a single term matches then assume that it's not a match
+                isMatch = input("Is this a match (1/0)?")
+                if isMatch:
+                    print 'partial match for '+label
+                    maxAndWhere = max_list_value(tempStore,-1)   # tuple
+                    print maxAndWhere
+                    k=maxAndWhere[0]   #put the index of the best match into k
+                    # put values for best match into variables for return
+                    neurolexID = tempStore[k][2]
+                    dicomTagID = tempStore[k][1]
+                    vrCode = tempStore[k][3]
+                    noMatch = 'False'
+                else:
+                    partMatch = 'False'
+            else:
+                partMatch = 'False'
+
+    if (partMatch == 'False') and (exactMatch == 'False'):
+        noMatch = 'True'
+        neurolexID = 'NF'
+        dicomTagID = 'NF'
+        vrCode = 'NF'
+        print "no match for "+label
+
+    return neurolexID, dicomTagID, vrCode, noMatch
+
+
+
+def tag_match(tag,nlxData):
+    '''
+    This code takes an input string (dicom tag) and tries to find an
+    exact match in another list of labels.  The two strings have different
+    initial formats so first have to put them in common format (8char string,
+    no non-alphanumeric characters)
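+    (For illustration: the Clunie file gives a tag as "(0028,0011)" while the
+    Neurolex CSV stores it as 0028_0011; both are reduced to 00280011 here
+    before comparison.)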
+    '''
+
+    # set match status flags
+    neuroLines = len(nlxData)
+    noMatch = 'True'
+    neurolexID = 'NF'
+    vrCode = 'NF'
+
+    # get the DICOM tag from the Clunie file in the format (XXXX,XXXX)
+    dicomTagIDGroup = re.search(r'.*\(([A-Za-z0-9\,]*)\)', tag)
+    if not dicomTagIDGroup:
+        print "bad dicom tag format for: "+tag
+    else:
+        dicomTagPartsList = dicomTagIDGroup.group(1).split(",")
+        dicomTagID = dicomTagPartsList[0]+dicomTagPartsList[1]
+        #print dicomTag
+
+    for i in range(neuroLines):
+        # This assumes that the correctly formatted tag is present
+        # (already checked in main)
+        # To get from XXXX_XXXX to XXXXXXXX
+        nlxDicomTagPartsList = nlxData[i][2].split("_")
+        nlxDicomTagID = nlxDicomTagPartsList[0]+nlxDicomTagPartsList[1]
+        #print nlxDicomTag
+
+        # check for exact match
+        if nlxDicomTagID == dicomTagID:
+            vrCode = nlxData[i][3]
+            neurolexID = nlxData[i][1]
+            noMatch = 'False'
+            break
+        else:
+            pass
+
+
+    if noMatch == 'True':
+        print "no match for "+dicomTagID
+    else:
+        print "match for "+dicomTagID
+
+    return neurolexID, dicomTagID, vrCode, noMatch
+
+
+
+def main():
+    nlxData = []
+    neurolexID = ''
+    dicomTagID = ''
+    vrCode = ''
+    ttlFile = open(outDir+outFile, "w")
+
+    write_ontology_header(ttlFile)
+    write_class_header(ttlFile)
+
+    # Neurolex/Interlex section*****************************
+    # put the label, Neurolex ID (if present), DICOM ID, and VR into a file that will be
+    # matched up to the label from the DICOM (Clunie-supplied) file
+    nlxFileData = open(nlxFile, "r")
+    entries = nlxFileData.readlines()
+    for entry in entries:
+
+        dicomIDGroup = re.search(r'.*DICOM:([A-Za-z0-9\_]*),', entry)
+        if not dicomIDGroup:
+            print "no dicom ID found in: ", entry
+            dicomID = "NF "
+        else:
+            dicomID = dicomIDGroup.group(1)
+            #print dicomID
+
+
+        nlxIDGroup = re.search(r'.*,(nlx_[0-9]*),', entry)
+        if not nlxIDGroup:
+            print "no nlx ID found in: ", entry
+            nlxID = "NF "
+        else:
+            nlxID = nlxIDGroup.group(1)
+            #print nlxID
+
+
+        vr = entry[-3:].rstrip("\n")   #get rid of newline character
+        vrGroup = re.search(r'(\"\,*)', vr)
+        if vrGroup:
+            if "US or SS" in entry:
+                vr = "US or SS"
+            elif "OB or OW" in entry:
+                vr = "OB or OW"
+            elif "OW or OB" in entry:
+                vr = "OB or OW"
+            elif "OP or OW" in entry:
+                vr = "OP or OW"
+            elif "US,SS,or OW" in entry:
+                vr = "US or SS"
+            elif "US or SS or OW" in entry:
+                vr = "US or SS or OW"
+            elif "does not exist" in entry:
+                vr = "does not exist"
+            else:
+                print "bad or missing VR value found in: ", entry
+                vr = "NF "
+        else:
+            vr = vr
+
+        #vr = vr.rstrip("\n")   #get rid of the newline character that appears
+
+        # problem here is that sometimes there are "" around Category and sometimes not
+        dicomLabelGroup = re.search(r'.*:Category:([A-Za-z0-9\s\-\/\(\)\'\&\"]*),', entry)
+        if not dicomLabelGroup:
+            print "no dicom label found in: ", entry
+            dicomLabel = "NF "
+        else:
+            dicomLabel = dicomLabelGroup.group(1)
+            if dicomLabel[-1] == '"':
+                dicomLabel = dicomLabel[:-1]
+            #print dicomLabel
+
+        # store extracted strings in a list for future retrieval - this is all relevant NLX data
+        nlxData.append([dicomLabel, nlxID, dicomID, vr])
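+
+    # At this point each nlxData element is [dicomLabel, nlxID, dicomID, vr], e.g.
+    # (illustration - the Neurolex ID is a made-up placeholder):
+    #   ['Columns', 'nlx_12345', '0028_0011', 'US']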
+
+    # DICOM document section************************************
+    # get the label, tag, definition for each term
+    tagList = []
+    multiTags = []
+    allEntries = []
+    idStart = 500
+    dicomFileData = open(inFile, "r")
+    lines = dicomFileData.readlines()
+    for line in lines:
+
+        # create a 5 digit ID with leading zeros to ID the tags
+        idStart = idStart + 1
+        numericalTagID = str(idStart).zfill(5)
+
+        # get the label
+        labelGroup = re.search(r'.*Name="([A-Za-z0-9\s\-\/\(\)\'\&]*)"\t', line)
+        label = labelGroup.group(1)
+        # get the tag
+        tagGroup = re.search(r'.*Tag=("[A-Za-z0-9\s\,\(\)]*")\t', line)
+        tag = tagGroup.group(1)   #left the quotes around the tag
+        # get the definition
+        definitionGroup = re.search(r'.*Description=(".*)', line)
+        definition = definitionGroup.group(1)   # has quotes already
+
+        # find the corresponding term from the extracted Neurolex info
+        #neurolexID, dicomTagID, vrCode, noMatch = string_match(label,nlxData)
+        neurolexID, dicomTagID, vrCode, noMatch = tag_match(tag,nlxData)
+
+        #tempList = [dicomTagID, definition]
+        #allEntries.append(tempList)
+
+        # determine which tags have multiple entries and create a non-repeating list
+        # of the multiple-entry tags (multiTags). tagList is a non-repeating list of all tags.
+        # {just store tag}
+        #if dicomTagID in tagList and dicomTagID not in multiTags:
+        #    multiTags.append(dicomTagID)
+        #else:
+        #    tagList.append(dicomTagID)
+
+        #{store all multiple tags and their definitions} - HOW TO STORE FIRST ONE OF MULTIPLE?
+        # look at each tag in turn and all tags after that tag.
+        #if dicomTagID in tagList and dicomTagID not in multiTags:
+        #    tempList = [dicomTagID, definition]
+        #    multiTags.append(tempList)
+        #else:
+        #    tagList.append(dicomTagID)
+
+        #labelCC = create_camelcase_label(label)
+        #print label
+        ttlFile.write("### "+classLink+dicomPrefix+numericalTagID+"\n")
+        ttlFile.write("\n")
+        ttlFile.write(dicomNS+dicomPrefix+numericalTagID+" "+rdfType+" "+owlDatatypeProperty+" ;\n")
+        ttlFile.write("\n")
+        ttlFile.write(" "+rdfsLabel+" "+'"'+label+'"'+xsdString+";\n")
+        ttlFile.write("\n")
+        ttlFile.write(" "+curationStatusReqDisc+";\n")
+        ttlFile.write("\n")
+        ttlFile.write(" "+definitionStr+" "+definition+xsdString+";\n")
+        ttlFile.write("\n")
+        ttlFile.write(" "+dicomTag+" "+tag+xsdString+";\n")
+        ttlFile.write("\n")
+
+        if noMatch == 'False':
+            ttlFile.write(" "+owlSameAs+" "+neurolexID+" ;\n")
+            ttlFile.write("\n")
+            ttlFile.write(" "+vrInDicom+" "+'"'+vrCode+'"'+xsdString+" ;\n")
+            ttlFile.write("\n")
+            ttlFile.write(" "+rdfsSub+" "+dcID+" .\n")
+        else:
+            ttlFile.write(" "+rdfsSub+" "+dcID+" .\n")
+
+        ttlFile.write("\n")
+
+    ttlFile.write("\n")
+    ttlFile.write("\n")
+
+    ttlFile.close()
+
+    #print multiTags
+    #print len(multiTags)
+
+    # write out the list of all tag and defs for later sorting
+    #with open(outDir+tagDefFile, "wb") as fp:
+    #    pickle.dump(allEntries,fp)
+    #fp.close()
+##############################################################
+if __name__ == "__main__":
+    main()
diff --git a/vr_generate_dict.py b/vr_generate_dict.py
new file mode 100644
index 0000000..bb8587c
--- /dev/null
+++ b/vr_generate_dict.py
@@ -0,0 +1,249 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# generate_dict_2015b.py
+
+"""
+    Reformat the DICOM dictionary PS3.6 and PS3.7 docbook xml files (from e.g. standard docs) to Python syntax
+    Write the main DICOM dictionary elements as a python dict called main_attributes with format:
+        Tag: ('VR', 'VM', "Name", 'is_retired', 'Keyword')
+    Where
+        Tag is a 32-bit representation of the group, element as 0xggggeeee (e.g. 0x00181600)
+        VR is the Value Representation (e.g. 'OB' or 'OB or UI' or 'NONE')
+        VM is the Value Multiplicity (e.g. '1' or '2-2n' or '3-n' or '1-32')
+        Name is the DICOM Element Name (or Message Field for Command Elements) (e.g. 'Tomo Time' or 'Retired-blank' or 'Time Source')
+        is_retired is '' if not retired, 'Retired' otherwise (e.g. '' or 'Retired')
+        Keyword is the DICOM Keyword (e.g. 'TomoTime' or 'TimeSource')
+    Also write the repeating groups or elements (e.g. group "50xx") as a python dict called
+    mask_attributes as masks that can be tested later for tag lookups that didn't work
+    using format:
+        'Tag': ('VR', 'VM', "Name", 'is_retired', 'Keyword')
+    Where
+        Tag is a string representation of the element (e.g. '002031xx' or '50xx0022')
+"""
+
+# Based on Rickard Holmberg's docbook_to_dict2013.py
+# http://code.google.com/r/rickardholmberg-pydicom/
+# but rewritten for not using bs4 (and slight change for standard v2015b)
+
+# Based on Rickard Holmberg's generate_dict_2015b.py - found online as part of the "pydicom" package.
+# Note that this doesn't grab the definitions - the dictionary in Part 06 doesn't
+# include them in the table that I pull info from. This code is used to generate a python
+# dict that contains tags and VR's. I keep the code to get the latest docbook from a URL,
+# but currently pull from an offline/local version of the latest docbook so I don't have to be online.
+# Also note that this code pulls from the tables in Part 06 and the "Command Fields"
+# and "Retired Command Fields" tables in Part 07. Originally, this was written out as two separate
+# dictionaries in one named dictionary. I decided to simplify this to one single dictionary
+# with no name, since I don't need the elements in the 2nd dict.
+# K. Helmer
+# Massachusetts General Hospital, 2018
+
+import urllib2
+import xml.etree.ElementTree as ET
+import os
+
+# pydict_filename = '../dicom/_dicom_dict.py'   #this is the filename format expected for pydicom codebase
+pydict_filename = 'dicom_dict_vr.dict'   # KGH
+main_dict_name = 'DicomDictionary'   #KGH - not used; only want dict in file, not "name = "
+mask_dict_name = 'RepeatersDictionary'
+
+def write_dict(f, dict_name, attributes, tagIsString):   #KGH-write out the tag as a string in both cases
+    if tagIsString:
+        #entry_format = """'{Tag}': ('{VR}', '{VM}', '{Name}', '{Retired}', '{Keyword}')"""
+        entry_format = """"{Tag}": ("{VR}", "{VM}", "{Name}", "{Retired}", "{Keyword}")"""   #KGH - try double quotes because some Names have apostrophes in them, e.g., "Referring Physician's Name"
+    else:
+        #entry_format = """{Tag}: ('{VR}', '{VM}', '{Name}', '{Retired}', '{Keyword}')"""   #original
+        #entry_format = """'{Tag}': ('{VR}', '{VM}', '{Name}', '{Retired}', '{Keyword}')"""   #KGH - make tag a string
+        entry_format = """"{Tag}": ("{VR}", "{VM}", "{Name}", "{Retired}", "{Keyword}")"""   #KGH - try double quotes because some Names have apostrophes in them, e.g., "Referring Physician's Name"
+
+    #f.write("\n%s = {\n " % dict_name)
+    #f.write("%s = {\n " % dict_name)   #KGH - no initial newline necessary + don't want "name = {}"
+    f.write("{\n ")   #KGH - just start with dict "{"
+    f.write(",\n ".join(entry_format.format(**attr) for attr in attributes))
+    f.write("\n}\n")
+
+
+def parse_docbook_table(book_root, caption, empty_field_name="Retired"):
+    """ Parses the given XML book_root for the table with caption matching caption for DICOM Element data
+        Returns a list of dicts with each dict representing the data for an Element from the table
+    """
+
+    br = '{http://docbook.org/ns/docbook}'   # Shorthand variable for book_root
+
+    # Find the table in book_root with caption
+    for table in book_root.iter('%stable' %br):
+        if table.find('%scaption' %br).text == caption:
+
+            def parse_header(header_row):
+                """ Parses the table's thead/tr row, header_row, for the column headers """
+                field_names = []
+
+                # The header_row should be <thead><tr>...</tr></thead>
+                # Which leaves the following:
+                #   <th><para><emphasis>Header 1</emphasis></para></th>
+                #   <th><para><emphasis>Header 2</emphasis></para></th>
+                #   etc...
+                # Note that for the part06 tables the last col header (Retired) is:
+                #   <th><para/></th>
+                for x in header_row.iter('%sth' %br):
+                    # If there is an emphasis tag under the para tag then its text is the column header
+                    if x.find('%spara' %br).find('%semphasis' %br) is not None:
+                        col_label = x.find('%spara' %br).find('%semphasis' %br).text
+                        field_names.append(col_label)
+
+                    # If there isn't an emphasis tag under the para tag then it must be the Retired header
+                    else:
+                        field_names.append("Retired")
+
+                return field_names
+
+            # Get the column headers
+            field_names = parse_header(table.find('%sthead' %br).find('%str' %br))
+
+            def parse_row(field_names, row):
+                """ Parses the table's tbody tr row, row, for the DICOM Element data
+                    Returns a dict {header1 : val1, header2 : val2, ...} for one Element
+                """
+
+                cell_values = []
+
+                # The row should be <tbody><tr>...</tr></tbody>
+                # Which leaves the following:
+                #   <td><para>Value 1</para></td>
+                #   <td><para>Value 2</para></td>
+                #   etc...
+                # Some rows are
+                #   <td><para><emphasis>Value 1</emphasis></para></td>
+                #   <td><para><emphasis>Value 2</emphasis></para></td>
+                #   etc...
+                # There are also some without text values
+                #   <td><para/></td>
+                #   <td><para><emphasis/></para></td>
+
+                for cell in row.iter('%spara' %br):
+                    # If we have an emphasis tag under the para tag
+                    emph_value = cell.find('%semphasis' %br)
+                    if emph_value is not None:
+                        # If there is a text value add it, otherwise add ""
+                        if emph_value.text is not None:
+                            cell_values.append(emph_value.text.strip().replace(u"\u200b", ""))   #200b is a zero width space
+                        else:
+                            cell_values.append("")
+                    # Otherwise just grab the para tag text
+                    else:
+                        if cell.text is not None:
+                            cell_values.append(cell.text.strip().replace(u"\u200b", ""))
+                        else:
+                            cell_values.append("")
+
+                return {key : value for key, value in zip(field_names, cell_values)}
+
+            # Get all the Element data from the table
+            attrs = [parse_row(field_names, row) for row in table.find('%stbody' %br).iter('%str' %br)]
+            return attrs
+
+attrs = []
+
+# KGH - first look in Part 06 for three specific tables (see attrs += statements for table names)
+#url = 'http://medical.nema.org/medical/dicom/current/source/docbook/part06/part06.xml'
+#response = urllib2.urlopen(url)
+fLoc = '/home/karl/Work/INCF/DICOM_docbook_latest/source/docbook/part06/part06.xml'   #KGH
+response = open(fLoc)   #KGH
+tree = ET.parse(response)
+root = tree.getroot()
+response.close()   # KGH
+
+attrs += parse_docbook_table(root, "Registry of DICOM Data Elements")
+attrs += parse_docbook_table(root, "Registry of DICOM File Meta Elements")
+attrs += parse_docbook_table(root, "Registry of DICOM Directory Structuring Elements")
+#KGH ---------------------------------------------------------------
+
+#KGH - Then look at Part 07 that has the command field tables
+fLoc = '/home/karl/Work/INCF/DICOM_docbook_latest/source/docbook/part07/part07.xml'   #KGH
+response = open(fLoc)   #KGH
+#url = 'http://medical.nema.org/medical/dicom/current/source/docbook/part07/part07.xml'
+#response = urllib2.urlopen(url)
+tree = ET.parse(response)
+root = tree.getroot()
+
+command_attrs = parse_docbook_table(root, "Command Fields")   # Changed from 2013 standard
+for attr in command_attrs:
+    attr["Name"] = attr["Message Field"]
+    attr["Retired"] = ""
+
+retired_command_attrs = parse_docbook_table(root, "Retired Command Fields")
+for attr in retired_command_attrs:
+    attr["Name"] = attr["Message Field"]
+    attr["Retired"] = "Retired"
+
+attrs += command_attrs
+attrs += retired_command_attrs
+#KGH -------------------------------------------------------------------------------
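+
+# Each element of attrs is now a dict keyed by the table column headers, e.g.
+# (illustration): {'Tag': '(0028,0011)', 'Name': 'Columns', 'Keyword': 'Columns',
+# 'VR': 'US', 'VM': '1', 'Retired': ''}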
+
+# KGH - attrs list now populated; sort by tag value
+attrs = sorted(attrs, key=lambda x: x["Tag"])
+
+main_attributes = []
+mask_attributes = []
+
+#KGH - check to see format of attrs key-value pair
+#print attrs[0]["Description of Field"]
+
+for attr in attrs:
+    group, elem = attr['Tag'][1:-1].split(",")
+
+    #KGH - unused, as the tables in Part 06 don't include definitions
+    #KGH check to see if Description of Field exists; if not create key and make value a blank string
+    #if 'Description of Field' in attr:
+    #    pass
+    #else:
+    #    attr['Description of Field'] = 'None'
+
+    # e.g. (FFFE,E000)
+    if attr['VR'] == 'See Note':
+        attr['VR'] = 'NONE'
+
+    # e.g. (0018,1153), (0018,8150) and (0018,8151)
+    attr["Name"] = attr["Name"].replace(u"µ", "u")   # replace micro symbol
+
+    # e.g. (0014,0023) and (0018,9445)
+    if attr['Retired'] in ['RET', 'RET - See Note']:
+        attr['Retired'] = 'Retired'
+
+    # e.g. (0008,0102), (0014,0025), (0040, A170)
+    if attr['Retired'] in ['DICOS', 'DICONDE', 'See Note']:
+        attr['Retired'] = ''
+
+    # e.g. (0028,1200)
+    attr['VM'] = attr['VM'].replace(" or ", " ")
+
+    # If blank then add dummy vals
+    # e.g. (0018,9445) and (0028,0020)
+    if attr['VR'] == '' and attr['VM'] == '':
+        attr['VR'] = 'OB'
+        attr['VM'] = '1'
+        attr['Name'] = 'Retired-blank'
+
+    # handle retired 'repeating group' tags
+    # e.g. (50xx,eeee) or (gggg,31xx)
+    if 'x' in group or 'x' in elem:
+        attr["Tag"] = group + elem
+        mask_attributes.append(attr)
+    else:
+        #attr["Tag"] = '0x%s%s' %(group, elem)
+        attr["Tag"] = '%s%s' %(group, elem)   #KGH - writing out as string; don't need 32-bit value
+        main_attributes.append(attr)
+
+py_file = file(pydict_filename, "wb")
+#KGH - the following 3 write lines are for pydicom only and not needed for NIDM
+#py_file.write("# %s\n" % os.path.basename(pydict_filename))
+#py_file.write('"""DICOM data dictionary auto-generated by %s"""\n' % os.path.basename(__file__))
+#py_file.write('from __future__ import absolute_import\n')
+write_dict(py_file, main_dict_name, main_attributes, tagIsString=False)
+#write_dict(py_file, mask_dict_name, mask_attributes, tagIsString=True)
+
+py_file.close()
+
+print ("Finished creating python file %s containing the dicom dictionary" % pydict_filename)
+print ("Wrote %d tags" % (len(main_attributes) + len(mask_attributes)))
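+
+# Example of the resulting dicom_dict_vr.dict contents (illustration only; the exact
+# entries depend on the docbook version parsed):
+# {
+#  "00080018": ("UI", "1", "SOP Instance UID", "", "SOPInstanceUID"),
+#  "00280011": ("US", "1", "Columns", "", "Columns")
+# }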