Commit 8b288811 authored by Benjamin Herfort
Browse files

Merge branch 'sven_cleanDateFormat' into 'master'

fixed different date format problem

See merge request !14
parents 7d699c1d 63d229f2
......@@ -66,7 +66,7 @@ def get_df_from_gsheet(credentials, doc_id: str, sheet_name: str) -> pd.DataFram
def upload_df_to_gsheet(
credentials, doc_id: str, sheet_name: str, df: pd.DataFrame
credentials, doc_id: str, sheet_name: str, df: pd.DataFrame
) -> None:
"""Upload data to google spreadsheet."""
......@@ -77,7 +77,7 @@ def upload_df_to_gsheet(
def addNoiseToDuplicatedCoords(df):
# are there duplicated coordinates?
noiseFactor = 10 ** -2
# round X and Y to ensure that close locations are considered as dubplicates as well
# round X and Y to ensure that close locations are considered as duplicates as well
df["Xround"] = df["X"].round(2)
df["Yround"] = df["Y"].round(2)
isDuplicate = df.duplicated(
......@@ -163,15 +163,34 @@ def run():
# TODO: this is currently broken! You need to provide a csv file manually.
new_data_df = get_trials_df_from_url()
# remove wrong case NCT04256395
studies_to_delete = ["NCT04256395", "NCT04061382", "NCT04226157", "NCT03331445", "NCT03680274", "ISRCTN51287266"]
studies_to_delete = ["NCT04256395", "NCT04226157", "ISRCTN51287266", "NCT04331860"]
for study in studies_to_delete:
new_data_df = new_data_df.drop(new_data_df[new_data_df['TrialID'] == study].index, axis=0)
# Some studies have been running for a while and have only now been updated to incorporate COVID-19.
# For these, the suggestion by Markus Reis and Konstantin is to use the date in "Last Update Post" instead.
studies_updated = ["EUCTR2015-002340-14-NL", "NCT03331445", "NCT04061382", "NCT03680274", "NCT03808922"]
needsUpdate = new_data_df['TrialID'].isin(studies_updated)
new_data_df["Date registration"] = np.where(needsUpdate,
new_data_df["Last Refreshed on"],
new_data_df["Date registration"])
isSlashDateFormat = new_data_df["Date registration"].str.find("/") == 2
def changeDateFormat(aStr):
    """Convert a "DD/MM/YYYY" date string to "YYYY-MM-DD".

    A value is treated as slash-formatted only when its first "/" sits at
    index 2 (i.e. a two-character leading field). Anything else is returned
    unchanged so that already-ISO dates pass straight through.

    Args:
        aStr: The raw date value. Usually a string; non-string values
            (e.g. None or NaN for a missing date) are returned as-is
            instead of raising AttributeError.

    Returns:
        The re-ordered "YYYY-MM-DD" string for slash-formatted input,
        otherwise the input value unchanged.
    """
    # Missing values reach this function via Series.apply; leave them alone.
    if not isinstance(aStr, str):
        return aStr
    if aStr.find("/") != 2:
        return aStr
    theParts = aStr.split("/")
    # Guard against partial dates such as "12/2020" that lack a third field.
    if len(theParts) < 3:
        return aStr
    # Reverse day/month/year into year-month-day.
    return "-".join((theParts[2], theParts[1], theParts[0]))
new_data_df["Date registration"] = new_data_df["Date registration"].apply(changeDateFormat)
new_data_df["Date registration"] = pd.to_datetime(new_data_df["Date registration"])
new_data_df["Date registration"] = new_data_df["Date registration"].apply(
lambda x: x.strftime("%Y-%m-%d")
# get categories from "old" trials data in gsheet
sheet_name = "trial_categories"
categories_df = get_df_from_gsheet(credentials, doc_id, sheet_name)
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment