From 922c8e1eebc28b2254360ddb8e8285c8f366d14b Mon Sep 17 00:00:00 2001
From: stephen520254 <81504133+stephen520254@users.noreply.github.com>
Date: Wed, 31 Jan 2024 13:19:00 +0800
Subject: [PATCH] Add files via upload

---
 orderList_v4_beta_1nov_rebecca.ipynb | 1039 ++++++++++++++++++++++++++
 1 file changed, 1039 insertions(+)
 create mode 100644 orderList_v4_beta_1nov_rebecca.ipynb

diff --git a/orderList_v4_beta_1nov_rebecca.ipynb b/orderList_v4_beta_1nov_rebecca.ipynb
new file mode 100644
index 0000000..f6b65f0
--- /dev/null
+++ b/orderList_v4_beta_1nov_rebecca.ipynb
@@ -0,0 +1,1039 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "12a493cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "import re\n",
+    "from datetime import datetime as dt\n",
+    "pd.set_option('display.max_rows', None)\n",
+    "import tabula\n",
+    "from tabula.io import read_pdf\n",
+    "import os\n",
+    "import sys"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "beeebc12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # rebecca\n",
+    "\n",
+    "# path = r'C:\Users\rebecca.lee\OneDrive - GC International AG\Rebecca Lee\orderListData\input_data\\\\'\n",
+    "\n",
+    "# path_output = r'C:\Users\rebecca.lee\OneDrive - GC International AG\Rebecca Lee\orderListData\output_data\\\\'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "939a7f99",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# stephen\n",
+    "\n",
+    "path_output = r'C:\Users\stephenl\OneDrive - GC International AG\ML\Trial\data_output\\\\'\n",
+    "\n",
+    "path = r'C:\Users\stephenl\OneDrive - GC International AG\ML\Trial\data_input\\\\'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dfbb97f6",
+   "metadata": {},
+   "source": [
+    "# Functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c0907986",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# separate product code and product packaging\n",
+    "def splitnJoin(x):\n",
+    "    remove0 = lambda x: x.split(' ')[0]\n",
+    "    remove1 = lambda x: x.split(' ')[1:]\n",
+    "    res1 = remove0(x)\n",
+    "    _=remove1(x)\n",
+    "    res2 = ' '.join(_)\n",
+    "    return res1, res2\n",
+    "\n",
+    "# replace a missing value with today's date (use pd.isnull; null() is not defined)\n",
+    "nandate = lambda x: dt.now() if pd.isnull(x) else x\n",
+    "\n",
+    "# function to extract PO number\n",
+    "def extractPo(df_):\n",
+    "    po = df_[0]\n",
+    "    po=po.fillna('nann')\n",
+    "    po_total = []\n",
+    "    for p in po.iloc[:,3]:\n",
+    "        if re.search(r'^[A-Z]\d{3}\s',p) or re.search(r'^[A-Z]\d{3}',p) or re.search(r'^[0-9]*$',p):\n",
+    "            p=p.replace('/','_').replace('-','_')\n",
+    "            po_total.append(p)\n",
+    "    if len(po_total)>0:\n",
+    "        return ''.join(po_total)\n",
+    "    else:\n",
+    "        return 'no po number available'\n",
+    "\n",
+    "\n",
+    "\n",
+    "# extract expiry month and year\n",
+    "def toMthYr(s):\n",
+    "    if re.search('\-[A-Z]\d{4}$',s):\n",
+    "        return dt.strptime(s[-4:][0:2]+'-'+s[-4:][2:4],\"%m-%y\")\n",
+    "    else:\n",
+    "        return dt.now()\n",
+    "\n",
+    "\n",
+    "# calculate date to expiry\n",
+    "def month_diff(x,y):\n",
+    "    end = x.dt.to_period('M').view(dtype='int64')\n",
+    "    start = y.dt.to_period('M').view(dtype='int64')\n",
+    "    return end - start\n",
+    "\n",
+    "def otherCharges(df_):\n",
+    "    df=df_[-1]\n",
+    "    \n",
+    "    ship_instr = ship(df_)\n",
+    "    df_new = pd.DataFrame(columns=['material_no','material_desc','batch','qty','unit_listprice','unit_discprice',\\\n",
+    "
'list_pricetotal','disc_pricetotal','po_num','exp_date','num_months'])\n", + " \n", + " df_new.loc[0,'exp_date']='Shipping Instructions'\n", + " df_new.loc[0,'num_months']=ship_instr\n", + " if 'Freight Charge' in df.iloc[:,0].tolist():\n", + " df_new.loc[1,'num_months']=df[df.iloc[:,0]=='Freight Charge'].Discount.iloc[0]\n", + " df_new.loc[1,'exp_date']='Freight Charge'\n", + " if 'Insurance Charge' in df.iloc[:,0].tolist():\n", + " df_new.loc[2,'num_months']=df[df.iloc[:,0]=='Insurance Charge'].Discount.iloc[0]\n", + " df_new.loc[2,'exp_date']='Insurance Charge'\n", + " if 'Packing Charge' in df.iloc[:,0].tolist():\n", + " df_new.loc[3,'num_months']=df[df.iloc[:,0]=='Packing Charge'].Discount.iloc[0]\n", + " df_new.loc[3,'exp_date']='Packing Charge'\n", + " if 'Total Excluding GST' in df.iloc[:,4].tolist():\n", + " df_new.loc[4,'num_months']=df[df.iloc[:,4]=='Total Excluding GST'].Discount.iloc[0]\n", + " df_new.loc[4,'exp_date']='Total Excluding GST'\n", + " if 'Other Charges' in df.iloc[:,4].tolist():\n", + " df_new.loc[5,'num_months']=df[df.iloc[:,4]=='Other Charges'].Discount.iloc[0]\n", + " df_new.loc[5,'exp_date']='Other Charges'\n", + " if 'Add GST @ 0%' in df.iloc[:,4].tolist():\n", + " df_new.loc[6,'num_months']=df[df.iloc[:,4]=='Add GST @ 0%'].Discount.iloc[0]\n", + " df_new.loc[6,'exp_date']='Add GST @ 0%'\n", + " if 'Add GST @ 7%' in df.iloc[:,4].tolist():\n", + " df_new.loc[6,'num_months']=df[df.iloc[:,4]=='Add GST @ 7%'].Discount.iloc[0]\n", + " df_new.loc[6,'exp_date']='Add GST @ 7%'\n", + " if 'Invoice Total Including GST' in df.iloc[:,4].tolist():\n", + " df_new.loc[7,'num_months']=df[df.iloc[:,4]=='Invoice Total Including GST'].Discount.iloc[0]\n", + " df_new.loc[7,'exp_date']='Invoice Total Including GST'\n", + " \n", + " for idx, i in zip(df_new.index, df_new.num_months):\n", + " try:\n", + " df_new.loc[idx,'num_months']=float(i)\n", + " except:\n", + " continue\n", + " \n", + " return df_new\n", + "\n", + "\n", + "def regsplit(m):\n", + " trim = lambda x: ' '.join(x.split())\n", + " g = re.search('.*\\-\\d{3,4}[A-Z]',m) or re.search('^\\d{6}',m) or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}',m)\\\n", + " or re.search('^[A-Z]\\d{8}',m) or re.search('.*\\-\\d{3,4}',m)\n", + " g = g.group()\n", + " g2 = m.replace(g,'')\n", + " return trim(g),trim(g2)\n", + "\n", + "# returns only\n", + "def regsplit2(m):\n", + " trim = lambda x: ' '.join(x.split())\n", + " g = re.search('.*\\-\\d{3,4}[A-Z]',m) or re.search('^.*\\d{6}',m) or re.search('^.*[A-Z]{2}\\d{2}\\-[A-Z]{2}',m)\\\n", + " or re.search('^.*[A-Z]\\d{8}',m) or re.search('.*\\-\\d{3,4}',m)\n", + " g = g.group()\n", + " return trim(g)\n", + "\n", + "def check_space(string):\n", + " \n", + " # counter\n", + " count = 0\n", + " \n", + " # loop for search each index\n", + " for i in range(0, len(string)):\n", + " \n", + " # Check each char\n", + " # is blank or not\n", + " if string[i] == \" \":\n", + " count += 1\n", + " \n", + " return count\n", + "\n", + "# function to return shipping instructions\n", + "def ship(df_):\n", + " si=df_[0]\n", + " si=si.dropna(subset=['Unnamed: 2'])\n", + " if 'Shipping Instructions:' in si['Unnamed: 2'].tolist():\n", + " idx = si[si['Unnamed: 2']=='Shipping Instructions:'].index[0]\n", + " si_list = si.loc[idx+1:,'Unnamed: 2']\n", + " si_list1 = si_list.tolist()\n", + " si_list2 = ','.join(si_list1)\n", + " else:\n", + " si_list2 = 'No Shipping Instructions Available'\n", + " return si_list2\n", + "\n", + "\n", + "def NPpcodesPI(i): \n", + " return pi_np[pi_np['material_no']==i]\n", + "\n", + 
"def NPpcodesOA(i):\n", + " return oa_np[oa_np['material_no']==i]\n", + "\n", + "def PpcodesPI(i): \n", + " return pi_p[pi_p['material_no']==i]\n", + "\n", + "def PpcodesOA(i):\n", + " return oa_p[oa_p['material_no']==i]\n", + "\n", + "def expiry():\n", + " a = temp_under16\n", + " poNum = a.po_numT.unique()\n", + " numPdts = a.qty.sum()\n", + " numPCodes = len(a.material_no.unique())\n", + " return poNum, numPdts, numPCodes\n", + "\n", + "def numPo():\n", + " a = all_pi_df\n", + " pos = a['customer_purchase_order_#_T'].unique()\n", + " numPos = len(a['customer_purchase_order_#_T'].unique())\n", + " return pos, numPos\n", + "\n", + "# extract 2 columns \n", + "def trimm(i):\n", + " return i[['material_no','po_numT']]\n", + "\n", + "# convert series to list\n", + "def listit(seri):\n", + " pc = []\n", + " for w in seri:\n", + " pc.append(w)\n", + " return pc\n", + "\n", + "\n", + "# # final status output for reconciliation\n", + "# def NPstatus():\n", + "# a = NP_status[NP_status.status=='Discrepancies in OA']\n", + "# mat = a.material_no\n", + "# po = a.po_numT.unique()\n", + "# req = (NP_status.status=='Missing code in OA') | (NP_status.status=='Missing code in PI')\n", + "# b = NP_status[req]\n", + "# pc = b.material_no\n", + "# pom = b.po_numT.unique()\n", + "# return listit(po),listit(mat), listit(pc),listit(pom)\n", + "\n", + "\n", + "# final status output for reconciliation\n", + "def NPstatus():\n", + " a = NP_status[NP_status.status=='Discrepancies in OA']\n", + " mat = a.material_no.unique()\n", + " po = a.po_numT.unique()\n", + " req = (NP_status.status=='Missing code in OA') | (NP_status.status=='Missing code in PI')\n", + " b = NP_status[req]\n", + " pc = b.material_no.unique()\n", + " pom = b.po_numT.unique()\n", + " return listit(po),listit(mat), listit(pc),listit(pom)\n", + "\n", + "\n", + "def Pstatus():\n", + " a = P_status[P_status.status=='Discrepancies in OA'] \n", + " mat = a.material_no.unique()\n", + " po = a.po_numT.unique()\n", + " req = (P_status.status=='Missing code in OA') | (P_status.status=='Missing code in PI')\n", + " b = P_status[req]\n", + " pc = b.material_no.unique()\n", + " pom = b.po_numT.unique()\n", + " return listit(po),listit(mat), listit(pc),listit(pom)\n", + "\n", + "def overallPO():\n", + " a = all_pi_df\n", + " pc = len(all_pi_df['customer_purchase_order_#'].unique())\n", + " return pc\n", + "\n", + "missingtext = lambda x: 'Missing code in PI' if x=='Discrepancies in OA' else 'Missing code in OA' if x=='Correct items in PI' else \\\n", + "'NA'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "562244d3", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "def execute(df_):\n", + " \n", + " # extract customer po & shipping instructions\n", + "\n", + " po_num = extractPo(df_)\n", + "\n", + " # create dataframe for packing charge etc\n", + " others = otherCharges(df_)\n", + " \n", + " \n", + " # create dataframes of all relevant rows\n", + "\n", + " # length of dataframe (raw)\n", + " leng = len(df_)\n", + "\n", + " # create new dataframe\n", + " temp = pd.DataFrame()\n", + "\n", + " # concat dataframes with 9 columns which have product codes\n", + "\n", + " for i in range(0, leng):\n", + " \n", + " if df_[i].shape[1]==9:\n", + " temp=pd.concat([temp,df_[i]])\n", + "\n", + " elif df_[i].shape[1]>9:\n", + " with open(path_output+f\"Alert_{po_num}_.txt\", \"w\") as f:\n", + " f.write(f'OA file with PO: {po_num} has to be fixed before running.')\n", + " print(f\"{po_num} has to be fixed before 
running.\")\n", + "\n", + " continue\n", + " else:\n", + " continue\n", + " \n", + "\n", + "\n", + " # reset index\n", + " temp = temp.reset_index(drop=True)\n", + " \n", + " req = (temp['Unnamed: 3']=='EA') | (temp['Unnamed: 3']=='BOX')\n", + " temp=temp[req]\n", + " \n", + " \n", + " # set as string\n", + " temp.iloc[:,0]=temp.iloc[:,0].astype(str)\n", + "\n", + " # find the last row with product code\n", + " tot=[]\n", + " for idx, m in zip(temp.index,temp.iloc[:,0]):\n", + " \n", + " if re.search('.*\\-\\d{3,4}\\D.*',m) or re.search('.*\\-\\d{3,4}\\s',m) or re.search('.*\\-\\d{3,4}$',m)\\\n", + " or re.search('^\\d{6}\\s',m) or re.search('^\\d{6}\\D.*',m) or re.search('^\\d{6}$',m)\\\n", + " or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\D*',m) or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\s',m)\\\n", + " or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}$',m)\\\n", + " or re.search('^[A-Z]\\d{8}\\D.*',m) or re.search('^[A-Z]\\d{8}\\s.*',m) or re.search('^[A-Z]\\d{8}$',m):\n", + " tot.append(idx)\n", + " else:\n", + " continue\n", + "\n", + " # replace 'nan' with null()\n", + " temp.replace('nan',np.nan, inplace=True)\n", + "\n", + "\n", + " # last row with valid product code\n", + " # ffill empty rows for columns 0 and 3\n", + "\n", + " temp.iloc[:max(tot)+2,0].fillna(method='ffill',inplace=True)\n", + " temp.iloc[:max(tot)+2,3].fillna(method='ffill',inplace=True)\n", + "\n", + " # convert to strings\n", + " temp.iloc[:,0]=temp.iloc[:,0].astype(str)\n", + " \n", + " # extract row index with correct product code\n", + " idx=[]\n", + " for ind,m in zip(temp.index,temp.iloc[:,0]):\n", + " \n", + " if re.search('.*\\-\\d{3,4}\\D.*',m) or re.search('.*\\-\\d{3,4}\\s',m) or re.search('.*\\-\\d{3,4}$',m)\\\n", + " or re.search('^\\d{6}\\s',m) or re.search('^\\d{6}\\D.*',m) or re.search('^\\d{6}$',m)\\\n", + " or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\D*',m) or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\s',m)\\\n", + " or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}$',m)\\\n", + " or re.search('^[A-Z]\\d{8}\\D.*',m) or re.search('^[A-Z]\\d{8}\\s.*',m) or re.search('^[A-Z]\\d{8}$',m):\n", + " idx.append(ind)\n", + " else:\n", + " continue\n", + " \n", + " # extract only rows with correct product code using index\n", + " temp= temp.loc[idx]\n", + " temp=temp.reset_index(drop=True)\n", + " \n", + " # copy to df\n", + " df=temp.copy()\n", + "\n", + " # create new dataframe to transfer all info over\n", + " new = pd.DataFrame(columns=['material_no','material_desc','batch','qty','unit_listprice','unit_discprice',\\\n", + " 'list_pricetotal','disc_pricetotal','po_num']) #-------------------------------------------\n", + "\n", + " # fill in new dataframe (df is the old dataframe)\n", + " matno=[]\n", + " matdes=[]\n", + " # if both mat'l no and mat'l description are in column 1 of old dataframe. 
This will separate them.\n", + " for i,m,d in zip(df.index,df.iloc[:,0],df.iloc[:,1]): # 1st column\n", + " if re.search('.*\\-\\d{3,4}\\D.*',m) or re.search('.*\\-\\d{3,4}\\s',m)\\\n", + " or re.search('^\\d{6}\\s',m) or re.search('^\\d{6}\\D.*',m)\\\n", + " or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\D*',m) or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\s',m)\\\n", + " or re.search('^[A-Z]\\d{8}\\D.*',m) or re.search('^[A-Z]\\d{8}\\s.*',m):\n", + " res1,res2 = regsplit(m)\n", + " \n", + " new.loc[i,'material_no'] =res1 # 1st column\n", + " new.loc[i,'material_desc'] = res2 # 2nd column\n", + " \n", + " # if both mat'l no and mat'l description are in separate columns of the old dataframe, then no need to separate\n", + " # just assign to the respective columns in the \"new\" dataframe\n", + " elif re.search('.*\\-\\d{3,4}$',m) or re.search('^\\d{6}$',m) or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}$',m)\\\n", + " or re.search('^[A-Z]\\d{8}$',m):\n", + " new.loc[i,'material_no'] = m # 1st column\n", + " new.loc[i,'material_desc'] = d # 2nd column\n", + "\n", + " # fill in batch number\n", + " new.batch = df.iloc[:,3] # 3rd column\n", + " \n", + "\n", + " df.iloc[:,5]=df.iloc[:,5].astype(str)\n", + " for idx,i in zip(df.index,df.iloc[:,5]):\n", + " space = check_space(i)\n", + "\n", + " if space==2:\n", + " unjoin1 = lambda m :str(m).split(' ')[0]\n", + " unjoin2 = lambda m :str(m).split(' ')[1]\n", + " unjoin3 = lambda m :str(m).split(' ')[2]\n", + " new.loc[idx,'qty'] = unjoin1(i)\n", + " new.loc[idx,'unit_listprice'] = unjoin2(i)\n", + " new.loc[idx,'unit_discprice'] = unjoin3(i)\n", + " # copy the rest of the values according to row indexes\n", + " new.loc[idx,'list_pricetotal'] = df.iloc[idx,7]# 5th column\n", + " new.loc[idx,'disc_pricetotal'] = df.iloc[idx,8]\n", + " new.loc[idx,'po_num'] = po_num\n", + "\n", + "\n", + "\n", + " elif space==1:\n", + " # remove error in qty where qty includes unit list price\n", + " unjoin1 = lambda m :str(m).split(' ')[0]\n", + " unjoin2 = lambda m :str(m).split(' ')[1]\n", + " new.loc[idx,'qty'] = unjoin1(i)\n", + " new.loc[idx,'unit_listprice'] = unjoin2(i)\n", + " # copy the rest of the values according to row indexes\n", + " new.loc[idx,'unit_discprice'] = df.iloc[idx,6]\n", + " new.loc[idx,'list_pricetotal'] = df.iloc[idx,7]# 5th column\n", + " new.loc[idx,'disc_pricetotal'] = df.iloc[idx,8]\n", + " new.loc[idx,'po_num'] = po_num\n", + "\n", + " else:\n", + " new.loc[idx,'qty'] = np.nan\n", + " \n", + " # drop row if qty is nan\n", + " new.dropna(subset=['qty'], inplace=True)\n", + " \n", + " # convert null to today's date to allow datatime calculation\n", + " new=new.replace(np.nan,'naan')\n", + " \n", + "\n", + " new['exp_date']=[toMthYr(i) for i in new.batch]\n", + " new['curr_date']=pd.to_datetime('today')\n", + " new['num_months'] = month_diff(new.exp_date, new.curr_date)\n", + " \n", + " \n", + " # convert 'exp_date' back to strings\n", + " new['exp_date'] = [i.strftime(\"%Y-%m\") for i in new['exp_date']]\n", + "\n", + " # reorganize columns of dataframe\n", + " new = new[['material_no','material_desc','batch','qty','unit_listprice','unit_discprice',\\\n", + " 'list_pricetotal','disc_pricetotal','po_num','exp_date','num_months']]\n", + " \n", + " \n", + " new['qty'] = [int(float(i)) for i in new['qty']]\n", + " new['list_pricetotal']=new['list_pricetotal'].str.replace(',',' ').str.replace(' ','')\n", + " new['list_pricetotal'] = [float(i) for i in new['list_pricetotal']]\n", + " new['disc_pricetotal']=new['disc_pricetotal'].str.replace(',',' 
').str.replace(' ','')\n", + " new['disc_pricetotal'] = [float(i) for i in new['disc_pricetotal']]\n", + "\n", + " new['unit_listprice'] = [float(i) for i in new['unit_listprice']]\n", + " new['unit_discprice'] = [float(i) for i in new['unit_discprice']]\n", + " new.num_months = [int(i) for i in new.num_months]\n", + "\n", + " # combine new dataframe and others dataframe and assign to OA dataframe\n", + " new = new.sort_values('material_no')\n", + " new = new.reset_index(drop=True)\n", + " \n", + " # if batch number is \"R\" then num_months column should be \"no match.....\"\n", + " for i,b in zip(new.index, new.batch):\n", + " if re.search('\\-R\\d{4}$',b):\n", + " new.loc[i,'num_months']=0\n", + " new['po_numT'] = [i[:4] for i in new.po_num]\n", + "\n", + " return new, others, po_num" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4f6d09c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e82e64d8", + "metadata": {}, + "outputs": [], + "source": [ + "# OA: groupby based on po numbers v = po_num, temp = consolidated OA\n", + "def execute2(temp):\n", + " \n", + " \n", + " # slicing those products with at least 16 months shelve life\n", + " temp_over16 = temp[temp.num_months >= 16]\n", + " temp_under16 = temp[temp.num_months < 16]\n", + " \n", + "\n", + " # reorganizating columns\n", + " temp = temp[['material_no','material_desc','qty','unit_listprice','unit_discprice','list_pricetotal','disc_pricetotal',\\\n", + " 'po_num','po_numT','batch','exp_date','num_months']]\n", + " \n", + " # groupby product code and run based on each PO Number (file has multiple PO Number)\n", + " tot_gby=pd.DataFrame()\n", + " \n", + " for i in temp.po_numT.unique():\n", + " temp1 = temp[temp.po_numT==i]\n", + " temp_gby = temp1.groupby(['material_no','material_desc','unit_listprice','unit_discprice','po_num','po_numT',\n", + " ]).agg(qty=('qty','sum'),list_pricetotal=('list_pricetotal','sum'), \n", + " disc_pricetotal=('disc_pricetotal','sum')).reset_index()\n", + " tot_gby=pd.concat([tot_gby,temp_gby])\n", + " \n", + " \n", + " \n", + " non_promo_gby=pd.DataFrame(columns=tot_gby.columns)\n", + " promo_gby = pd.DataFrame(columns=tot_gby.columns)\n", + " \n", + " tot_gby = tot_gby.reset_index(drop=True)\n", + " _=tot_gby.copy()\n", + " # create dataframe for non promo and promo products on loop (based on conditions)\n", + " for c, l, d in zip(_.index,_.unit_listprice, _.unit_discprice):\n", + "\n", + " if d==0 and l==0 or d
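
A minimal end-to-end driver sketch, not code from the uploaded notebook, showing how the pieces above are meant to fit together: read_pdf pulls every table out of one order-acknowledgement PDF, execute() turns that list of tables into the cleaned line-item frame plus the charges/shipping frame, and the per-file results are concatenated before the per-PO grouping in execute2(). The folder paths, the file loop, the output CSV name, and the assumption that execute2() returns the grouped result are all illustrative stand-ins rather than values taken from the notebook.

import os
import pandas as pd
from tabula.io import read_pdf

# assumed stand-ins for the notebook's `path` / `path_output` OneDrive folders
path = 'C:/data_input/'
path_output = 'C:/data_output/'

all_oa = pd.DataFrame()      # consolidated OA line items across PDFs
all_others = pd.DataFrame()  # freight / GST / shipping-instruction rows per PDF

for fname in os.listdir(path):                    # hypothetical: one OA PDF per file
    if not fname.lower().endswith('.pdf'):
        continue
    tables = read_pdf(path + fname, pages='all')  # list of DataFrames, the shape execute() expects
    new, others, po_num = execute(tables)         # execute() is defined in the cells above
    all_oa = pd.concat([all_oa, new], ignore_index=True)
    all_others = pd.concat([all_others, others], ignore_index=True)

grouped = execute2(all_oa)                        # per-PO grouping of the consolidated OA rows
all_oa.to_csv(path_output + 'oa_consolidated.csv', index=False)  # illustrative output file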