Commit 63d229f2 authored by Sven Lautenbach's avatar Sven Lautenbach
Browse files

fixed different date format problem

parent 7d699c1d
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -66,7 +66,7 @@ def get_df_from_gsheet(credentials, doc_id: str, sheet_name: str) -> pd.DataFram
def upload_df_to_gsheet(
credentials, doc_id: str, sheet_name: str, df: pd.DataFrame
credentials, doc_id: str, sheet_name: str, df: pd.DataFrame
) -> None:
"""Upload data to google spreadsheet."""
d2g.upload(
......@@ -77,7 +77,7 @@ def upload_df_to_gsheet(
def addNoiseToDuplicatedCoords(df):
# are there duplicated coordinates?
noiseFactor = 10 ** -2
# round X and Y to ensure that close locations are considered as dubplicates as well
# round X and Y to ensure that close locations are considered as duplicates as well
df["Xround"] = df["X"].round(2)
df["Yround"] = df["Y"].round(2)
isDuplicate = df.duplicated(
......@@ -163,15 +163,34 @@ def run():
# TODO: this is currently broken! You need to provide a csv file manually.
new_data_df = get_trials_df_from_url()
# remove wrong case NCT04256395
studies_to_delete = ["NCT04256395", "NCT04061382", "NCT04226157", "NCT03331445", "NCT03680274", "ISRCTN51287266"]
studies_to_delete = ["NCT04256395", "NCT04226157", "ISRCTN51287266", "NCT04331860"]
for study in studies_to_delete:
new_data_df = new_data_df.drop(new_data_df[new_data_df['TrialID'] == study].index, axis=0)
# For some studies that have been running for a while and have now been updated to incorporate COVID-19
# as well, the suggestion by Markus Reis and Konstantin is to use the date in "Last Update Post" instead.
studies_updated = ["EUCTR2015-002340-14-NL", "NCT03331445", "NCT04061382", "NCT03680274", "NCT03808922"]
needsUpdate = new_data_df['TrialID'].isin(studies_updated)
new_data_df["Date registration"] = np.where(needsUpdate,
new_data_df["Last Refreshed on"],
new_data_df["Date registration"])
isSlashDateFormat = new_data_df["Date registration"].str.find("/") == 2
def changeDateFormat(aStr):
    """Rewrite a slash-separated date string as a dash-separated one.

    A string whose first "/" sits at index 2 (i.e. a two-character leading
    field such as "12/03/2020") is split on "/" and its first three fields
    are emitted in reverse order joined by "-" ("2020-03-12").
    NOTE(review): this presumably turns day/month/year into year-month-day;
    confirm against the upstream data.

    Any other value is returned unchanged, including:
    * non-string values such as None or NaN, which have no ``find`` method
      and previously raised AttributeError;
    * strings that match the slash test but contain fewer than three
      fields, which previously raised IndexError.

    :param aStr: the cell value to normalise (normally a string).
    :return: the reordered date string, or ``aStr`` itself when no
        conversion applies.
    """
    # Missing cells arrive as None/NaN rather than str; pass them through.
    if not isinstance(aStr, str):
        return aStr

    # Only strings whose first slash is at position 2 are treated as
    # slash-formatted dates; everything else is left as it is.
    if aStr.find("/") != 2:
        return aStr

    theParts = aStr.split("/")
    # Guard against truncated values like "12/03" that lack a third field.
    if len(theParts) < 3:
        return aStr

    return "-".join((theParts[2], theParts[1], theParts[0]))
new_data_df["Date registration"] = new_data_df["Date registration"].apply(changeDateFormat)
new_data_df["Date registration"] = pd.to_datetime(new_data_df["Date registration"])
new_data_df["Date registration"] = new_data_df["Date registration"].apply(
lambda x: x.strftime("%Y-%m-%d")
)
# get categories from "old" trials data in gsheet
sheet_name = "trial_categories"
categories_df = get_df_from_gsheet(credentials, doc_id, sheet_name)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment