-
Notifications
You must be signed in to change notification settings - Fork 15
/
cancer_data.py
29 lines (26 loc) · 1.37 KB
/
cancer_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from vec import Vec
from matutil import rowdict2mat
def read_training_data(fname, D=None):
    """Read labeled tissue-sample data from a comma-separated text file.

    Given a file in appropriate format, and given a set D of features,
    returns the pair (A, b) consisting of
    a P-by-D matrix A and a P-vector b,
    where P is a set of patient identification integers (IDs).
    For each patient ID p,
      - row p of A is the D-vector describing patient p's tissue sample,
      - entry p of b is +1 if patient p's tissue is malignant, and -1 if
        it is benign.
    The set D of features must be a subset of the features in the data
    (see text).

    File layout, as indexed by this function: each non-blank line is a
    comma-separated record whose field 0 is the integer patient ID, field 1
    is the diagnosis code, and fields 2.. are the numeric feature values
    ordered as all "(mean)" values, then all "(stderr)" values, then all
    "(worst)" values, each group in the order of ``params`` below.

    Args:
        fname: path of the data file to read.
        D: collection of feature labels such as "radius(mean)" to extract.
            When None, all 30 known features are used.

    Returns:
        A pair ``(A, b)``: ``A`` is ``rowdict2mat`` applied to a dict mapping
        each patient ID to its feature ``Vec`` over ``D``; ``b`` is a ``Vec``
        over the set of patient IDs holding -1 where the diagnosis field is
        exactly 'B' and +1 for any other diagnosis value.

    Raises:
        KeyError: if D contains a label that is not a known feature.
        ValueError: if a patient ID or a selected feature value is not numeric.
        IndexError: if a record has fewer fields than a selected feature needs.
    """
    params = ["radius", "texture", "perimeter", "area", "smoothness",
              "compactness", "concavity", "concave points", "symmetry",
              "fractal dimension"]
    stats = ["(mean)", "(stderr)", "(worst)"]

    # Map each feature label to its 0-based position among the feature
    # columns: statistics are laid out in consecutive groups of len(params).
    feature_map = {
        param + stat: stat_index * len(params) + param_index
        for param_index, param in enumerate(params)
        for stat_index, stat in enumerate(stats)
    }
    if D is None:
        D = set(feature_map)

    feature_vectors = {}
    patient_diagnoses = {}
    # The context manager guarantees the file is closed, even if a record
    # fails to parse part-way through.
    with open(fname) as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on int('').
            if not line.strip():
                continue
            row = line.split(",")
            patient_ID = int(row[0])
            patient_diagnoses[patient_ID] = -1 if row[1] == 'B' else +1
            # +2 skips the ID and diagnosis fields that precede the features.
            feature_vectors[patient_ID] = Vec(
                D, {f: float(row[feature_map[f] + 2]) for f in D}
            )
    return (rowdict2mat(feature_vectors),
            Vec(set(patient_diagnoses), patient_diagnoses))