
Resolve "Lacking sanity checks to iricdata connection"

Merged Albert Feghaly requested to merge 21-lacking-sanity-checks-to-iricdata-connection into master
+ 136 − 66
@@ -42,12 +42,15 @@ class Client:
if self.token is not None:
if self.pwd is not None:
sys.stderr.write('WARNING: Ambiguous authentification, ignoring ' +
'password in favor of PAT\n')
sys.stderr.write(
'WARNING: Ambiguous authentication, ' +
'ignoring password in favor of PAT\n'
)
self.token = self.token.strip()
session.headers.update({'Iric-Auth-Token': F'{self.token}'})
elif self.user and self.pwd:
elif self.user is not None and self.pwd is not None:
login_url = os.path.join(self.url, 'login/')
session.get(login_url)
csrftoken = session.cookies['csrftoken']
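Note: a minimal sketch of the two authentication paths this hunk distinguishes. The import path and constructor arguments are assumptions inferred from the attributes (token, user, pwd) used in the diff.

from iricdata import Client  # assumed import path

# PAT-based access: the token is stripped and sent as the
# 'Iric-Auth-Token' header on every request.
client = Client(url='https://iric-data.example.org', token='my-pat')

# Username/password access: a CSRF token is first fetched from login/.
client = Client(url='https://iric-data.example.org',
                user='jdoe', pwd='secret')

# Supplying both emits the WARNING above and the password is ignored.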
@@ -61,56 +64,107 @@ class Client:
login = session.post(login_url, cookies=cookies, data=payload,
headers=headers)
if login.status_code == 200:
if any(x in login.text for x in ['Erreur', 'Error']):
raise IricDataConnectionError('Connexion failed -- verify ' +
'your username and password')
else:
sys.stdout.write('Your connexion to IRIC-Data has been ' +
'established [user=%s]\n' % self.user)
else:
raise IricDataConnectionError('Could not initiate connexion ' +
'with IRIC-Data')
if login.status_code != 200:
raise IricDataConnectionError(
'Could not initiate connection with IRIC-Data.'
)
# ==> A cleaner way would be to verify route accesses directly
# as done for PAT-based access
#elif any(x in login.text for x in ['Erreur', 'Error']):
# raise IricDataConnectionError(
# '[Connection failed] Please verify ' +
# 'your username and password.'
# )
else:
raise IricDataConnectionError('Connexion failed -- please verify ' +
'that you submitted a token or a ' +
'username/password combination')
raise IricDataConnectionError(
'[Connection failed] Missing PAT or ' +
'username/password combination.'
)
self.session = session
self.datasets = self.get_available_datasets()
sys.stdout.write(
'Your connection to IRIC-Data has been ' +
'established [user=%s]\n' % self.user
)
try:
self.labs = self.get_available_labs()
except Exception:
self.labs = None
print('get_available_labs() method is not implemented in api/v1 of IRIC-Data')
sys.stderr.write(
'get_available_labs() method is not ' +
'implemented in api/v1 of IRIC-Data\n'
)
try:
self.datafiles = self.get_available_datafiles()
except Exception:
self.datafiles = None
sys.stderr.write(
'get_available_datafiles() method is not ' +
'implemented in api/v1 of IRIC-Data\n'
)
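Note: after construction, labs and datafiles degrade to None when the server only exposes api/v1, so downstream code should guard for that, e.g.:

# Hedged sketch, assuming `client` from the constructor example above.
if client.labs is None:
    # api/v1 server: lab metadata is unavailable, keep to dataset queries.
    pass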
def get_response(self, url, **kwargs):
"""Manage GET requests to IRIC-Data
:param url: IRIC-Data view URL.
:param **kwargs: Optional arguments for request.session().get().
"""
r = self.session.get(url, **kwargs)
if r.status_code == 200:
return r
elif r.status_code == 401:
raise IricDataConnectionError(
'[Authentication failed] ' +
'Please double-check your credentials.'
)
elif r.status_code == 404:
sys.stderr.write(
'[Not found] ' +
'The requested resource does not exist.\n'
)
return None
else:
raise IricDataConnectionError(
'[Error %d] ' % r.status_code +
'Cannot continue with your request.'
)
self.datasets = self.get_available_datasets()
#self.datafiles = self.get_available_datafiles()
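Note: get_response() centralizes the status-code handling that every GET below now relies on. The contract, sketched with a hypothetical URL (assuming `client` from the constructor example above):

# 200 -> the Response object is returned
# 401 -> IricDataConnectionError is raised
# 404 -> a warning goes to stderr and None is returned
# any other code -> IricDataConnectionError with the status code
r = client.get_response('https://iric-data.example.org/secure/datafiles/meta/42')
if r is not None:  # None only happens on 404
    payload = r.json()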
""" Return DataFrame with Name and ID for available labs """
def get_available_labs(self):
r = self.session.get(os.path.join(
self.url, 'secure/datafiles/annotate'
))
endpoint_url = os.path.join(self.url, 'secure/datafiles/annotate')
r = self.get_response(endpoint_url)
soup = BeautifulSoup(r.text, 'html.parser')
options = soup.find(id='id_lab').find_all('option')
df = pd.DataFrame({
'lab_id': [x['value'] for x in options],
'lab_name': [x.contents[0] for x in options]})
if ''.join(list(set(df.loc[0].lab_name))) == '-':
df.drop(0, axis=0, inplace=True)
df.index = df.lab_id.copy()
df.index.name = 'ID'
return df
""" Fetch Dataset Name and Slug ID available to the user as JSON """
def get_available_datasets(self):
r = self.session.get(
os.path.join(self.url, self.api_my_datasets)
).json()
endpoint_url = os.path.join(self.url, self.api_my_datasets)
r = self.get_response(endpoint_url).json()
df = pd.DataFrame(r['data'], columns=['dataset_name', 'dataset_slug'])
df.index = df.dataset_slug.copy()
df.index.name = 'ID'
return df
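Note: both lookup tables are indexed by their ID column, so .loc works directly. Values are hypothetical, and labs may be None on api/v1 servers (see the fallback in __init__ above).

client.datasets.loc['my-dataset-slug'].dataset_name  # name for a given slug
if client.labs is not None:
    client.labs.loc['3'].lab_name  # lab_id values are scraped as strings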
""" Return a DataFrame of user's files. A filter can be made on annotation
@@ -124,7 +178,7 @@ class Client:
return(None)
elif key_anno is not None and value_anno is not None:
if dataset_id is None:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_key_value,
@@ -132,7 +186,7 @@ class Client:
)
).json()
else:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_key_value,
@@ -141,7 +195,7 @@ class Client:
).json()
elif key_anno is not None:
if dataset_id is None:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_key,
@@ -149,7 +203,7 @@ class Client:
)
).json()
else:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_key,
@@ -158,7 +212,7 @@ class Client:
).json()
elif value_anno is not None:
if dataset_id is None:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_value,
@@ -166,7 +220,7 @@ class Client:
)
).json()
else:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_value,
@@ -174,7 +228,7 @@ class Client:
)
).json()
elif dataset_id is not None:
r = self.session.get(
r = self.get_response(
os.path.join(
self.url,
self.api_datafiles_dataset,
@@ -184,6 +238,7 @@ class Client:
# id is internal to iric-data
df = pd.DataFrame(r['data']).rename({'id': 'numerical_id'}, axis=1)
if 'iric_data_id' in df.columns:
df.index = df.iric_data_id
df.index.name = 'ID'
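Note: the get_response() substitutions above cover every branch of get_datafiles_list(). The call patterns look like this (argument names come from the diff, values are hypothetical; with no filter at all the method appears to return None):

client.get_datafiles_list(key_anno='tissue', value_anno='liver')
client.get_datafiles_list(key_anno='tissue', dataset_id='my-dataset-slug')
client.get_datafiles_list(value_anno='liver')
client.get_datafiles_list(dataset_id='my-dataset-slug')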
@@ -196,41 +251,41 @@ class Client:
""" Return metadata JSON for a given file_id """
def get_file_metadata(self, file_id):
metadata = self.session.get(os.path.join(
endpoint_url = os.path.join(
self.url, self.api_datafiles_meta, str(file_id)
))
try:
)
metadata = self.get_response(endpoint_url)
if metadata is not None:
metadata = metadata.json()
# TODO except something:
except:
sys.stderr.write('ERROR: File %s does not exist in ' +
'database\n' % str(file_id))
metadata = None
return metadata
""" Return annotations JSON for a given file_id """
def get_file_annotation(self, file_id):
annotation = self.session.get(os.path.join(
self.url, self.api_datafiles_annotation, str(file_id)
))
try:
endpoint_url = os.path.join(self.url,
self.api_datafiles_annotation, str(file_id))
annotation = self.get_response(endpoint_url)
if annotation is not None:
annotation = annotation.json()
# TODO except something:
except:
sys.stderr.write('ERROR: File %s does not exist in ' +
'database\n' % str(file_id))
annotation = None
return annotation
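Note: with get_response() in place, both helpers return a dict on success and None when the file is unknown, instead of relying on a bare except. Hypothetical file id:

meta = client.get_file_metadata(42)    # dict, or None on 404
anno = client.get_file_annotation(42)  # dict, or None on 404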
def get_file_content(self, file_id):
"""Returns datafile content for a given file_id"""
conn = self.get_file_data_conn(file_id)
if conn is not None:
return conn.content
else:
return None
""" Return a connector of the contents of a file for a given file_id """
def get_file_data_conn(self, file_id):
path = os.path.join(self.url, 'secure/datafiles/download', str(file_id))
try:
content = self.session.get(path, allow_redirects=True)
# TODO except something:
except:
sys.stderr.write('ERROR: File %s does not exist in database\n' % str(file_id))
content = None
dwnl_url = os.path.join(self.url,
'secure/datafiles/download', str(file_id))
content = self.get_response(dwnl_url)
return content
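Note: get_file_content() delegates to get_file_data_conn(), which now routes through get_response(), so a missing id yields None rather than an exception. Sketch with a hypothetical id and output name:

raw = client.get_file_content(42)
if raw is not None:
    with open('datafile.bin', 'wb') as fh:
        fh.write(raw)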
""" Return DataFrame (file_name, file_id, file_slug, file_hash) for
@@ -243,31 +298,39 @@ class Client:
""" Get a subset of the available datasets for which there is a match """
def filter_datasets(self, term, exact_match=False):
df = self.datasets
if exact_match:
return self.datasets[self.datasets.dataset_name.str.fullmatch(term)]
return df[df.dataset_name.str.fullmatch(term)]
else:
return self.datasets[self.datasets.dataset_name.str.contains(term)]
return df[df.dataset_name.str.contains(term)]
""" Get a subset of the available datafiles for which there is a match,
kwargs refer to get_datafiles_list arguments
"""
def filter_datafiles(self, term, field='filename', exact_match=False, **kwargs):
df = self.get_datafiles_list(**kwargs)
if exact_match:
return df[df[field].str.fullmatch(term)]
if df is None:
sys.stderr.write("Please try again with more arguments, for more details please see function get_datafiles_list()")
return None
elif df.empty:
return None
else:
return df[df[field].str.contains(term)]
if exact_match:
return df[df[field].str.fullmatch(term)]
else:
return df[df[field].str.contains(term)]
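Note: hypothetical filtering calls; field defaults to 'filename' and any extra kwargs are forwarded to get_datafiles_list().

partial = client.filter_datafiles('sample_', dataset_id='my-dataset-slug')
exact = client.filter_datafiles('sample_01.fastq.gz', exact_match=True,
                                dataset_id='my-dataset-slug')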
""" Get file content according to file_id """
def get_file(self, file_id):
try:
file_metadata = self.get_file_metadata(file_id)
file_content = self.get_file_data_conn(file_id).content
file_content = self.get_file_content(file_id)
file_annotation = self.get_file_annotation(file_id)
return IDF(file_metadata, file_content, file_annotation)
except TypeError:
return IDF(None, None, None)
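Note: get_file() bundles the three lookups into an IDF object, and a failed lookup now degrades to IDF(None, None, None) instead of raising. Hypothetical id:

idf = client.get_file(42)  # carries metadata, content and annotation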
""" Download file according to file_id """
def dwnl_file_content(self, file_id, folder_out=None, filename=None):
file_meta = self.get_file_metadata(file_id)
@@ -277,9 +340,10 @@ class Client:
folder_out = os.path.join(os.getcwd(), folder_out)
else:
folder_out = os.getcwd()
file_conn = self.get_file_data_conn(file_id)
filename = file_meta['filename'] if filename is None else filename
out_file_path = os.path.join(folder_out, filename)
file_conn = self.get_file_data_conn(file_id)
assert file_conn is not None # should never be None here
if os.path.exists(out_file_path):
sys.stderr.write('Warning: File already exists at ' +
'location %s, skipping.\n' % out_file_path)
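Note: hedged usage sketch; a relative folder_out is resolved against os.getcwd(), and an existing file at the target path is skipped with a warning.

client.dwnl_file_content(42, folder_out='downloads')  # id hypothetical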
@@ -298,9 +362,9 @@ class Client:
folder_out = os.path.join(os.getcwd(), folder_out)
else:
folder_out = os.getcwd()
annotations = self.get_file_annotation(file_id)
filename = file_meta['filename'] if filename is None else filename
out_file_path = os.path.join(folder_out, filename + '.json')
annotations = self.get_file_annotation(file_id)
if os.path.exists(out_file_path):
sys.stderr.write('Warning: File already exists at ' +
'location %s, skipping.\n' % out_file_path)
@@ -309,11 +373,17 @@ class Client:
with open(out_file_path, 'w') as outfile:
json.dump(annotations, outfile)
""" Download an entire dataset """
def dwnl_dataset(self, dataset_id, folder_out=None, datasetname=None):
dataset = self.get_dataset_filelist(dataset_id)
datasetname = self.datasets.loc[dataset_id].dataset_name if datasetname is None else datasetname
for file_id in np.unique(dataset.file_id):
if folder_out:
if folder_out[0] != '/':
folder_out = os.path.join(os.getcwd(), folder_out)
else:
folder_out = os.getcwd()
for file_id in np.unique(dataset.iric_data_id):
self.dwnl_file_content(
file_id, os.path.join(folder_out, datasetname)
)
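Note: dwnl_dataset() now resolves folder_out once before the loop and reuses dwnl_file_content() for each unique iric_data_id. Hypothetical slug:

client.dwnl_dataset('my-dataset-slug', folder_out='downloads')
# files land in downloads/<dataset_name>/ under the working directory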
@@ -486,7 +556,7 @@ class Client:
file_id = str(file_id)
update_url = os.path.join(self.url, 'secure/datafiles/update', file_id)
r = self.session.get(update_url)
r = self.get_response(update_url)
soup = BeautifulSoup(r.text, 'html.parser')
current_filename = soup.find(id='id_filename')['value']
current_lab = soup.find(id='id_lab').find_all('option', selected=True)[0]['value']