From 922c8e1eebc28b2254360ddb8e8285c8f366d14b Mon Sep 17 00:00:00 2001
From: stephen520254 <81504133+stephen520254@users.noreply.github.com>
Date: Wed, 31 Jan 2024 13:19:00 +0800
Subject: [PATCH] Add files via upload

---
 orderList_v4_beta_1nov_rebecca.ipynb | 1039 ++++++++++++++++++++++++++
 1 file changed, 1039 insertions(+)
 create mode 100644 orderList_v4_beta_1nov_rebecca.ipynb

diff --git a/orderList_v4_beta_1nov_rebecca.ipynb b/orderList_v4_beta_1nov_rebecca.ipynb
new file mode 100644
index 0000000..f6b65f0
--- /dev/null
+++ b/orderList_v4_beta_1nov_rebecca.ipynb
@@ -0,0 +1,1039 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "12a493cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "import re\n",
+    "from datetime import datetime as dt\n",
+    "pd.set_option('display.max_rows', None)\n",
+    "import tabula\n",
+    "from tabula.io import read_pdf\n",
+    "import os\n",
+    "import sys"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "beeebc12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # rebecca\n",
+    "\n",
+    "# path = r'C:\Users\rebecca.lee\OneDrive - GC International AG\Rebecca Lee\orderListData\input_data\\\\'\n",
+    "\n",
+    "# path_output = r'C:\Users\rebecca.lee\OneDrive - GC International AG\Rebecca Lee\orderListData\output_data\\\\'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "939a7f99",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# stephen\n",
+    "\n",
+    "path_output = r'C:\Users\stephenl\OneDrive - GC International AG\ML\Trial\data_output\\\\'\n",
+    "\n",
+    "path = r'C:\Users\stephenl\OneDrive - GC International AG\ML\Trial\data_input\\\\'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dfbb97f6",
+   "metadata": {},
+   "source": [
+    "# Functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c0907986",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# separate product code and product packaging\n",
+    "def splitnJoin(x):\n",
+    "    remove0 = lambda x: x.split(' ')[0]\n",
+    "    remove1 = lambda x: x.split(' ')[1:]\n",
+    "    res1 = remove0(x)\n",
+    "    _=remove1(x)\n",
+    "    res2 = ' '.join(_)\n",
+    "    return res1, res2\n",
+    "\n",
+    "# replace a missing value with today's date (use pd.isnull; null() is not defined)\n",
+    "nandate = lambda x: dt.now() if pd.isnull(x) else x\n",
+    "\n",
+    "# function to extract PO number\n",
+    "def extractPo(df_):\n",
+    "    po = df_[0]\n",
+    "    po=po.fillna('nann')\n",
+    "    po_total = []\n",
+    "    for p in po.iloc[:,3]:\n",
+    "        if re.search(r'^[A-Z]\d{3}\s',p) or re.search(r'^[A-Z]\d{3}',p) or re.search(r'^[0-9]*$',p):\n",
+    "            p=p.replace('/','_').replace('-','_')\n",
+    "            po_total.append(p)\n",
+    "    if len(po_total)>0:\n",
+    "        return ''.join(po_total)\n",
+    "    else:\n",
+    "        return 'no po number available'\n",
+    "\n",
+    "\n",
+    "\n",
+    "# extract expiry month and year\n",
+    "def toMthYr(s):\n",
+    "    if re.search('\-[A-Z]\d{4}$',s):\n",
+    "        return dt.strptime(s[-4:][0:2]+'-'+s[-4:][2:4],\"%m-%y\")\n",
+    "    else:\n",
+    "        return dt.now()\n",
+    "\n",
+    "\n",
+    "# calculate date to expiry\n",
+    "def month_diff(x,y):\n",
+    "    end = x.dt.to_period('M').view(dtype='int64')\n",
+    "    start = y.dt.to_period('M').view(dtype='int64')\n",
+    "    return end - start\n",
+    "\n",
+    "def otherCharges(df_):\n",
+    "    df=df_[-1]\n",
+    "    \n",
+    "    ship_instr = ship(df_)\n",
+    "    df_new = pd.DataFrame(columns=['material_no','material_desc','batch','qty','unit_listprice','unit_discprice',\\\n",
+    "
'list_pricetotal','disc_pricetotal','po_num','exp_date','num_months'])\n", + " \n", + " df_new.loc[0,'exp_date']='Shipping Instructions'\n", + " df_new.loc[0,'num_months']=ship_instr\n", + " if 'Freight Charge' in df.iloc[:,0].tolist():\n", + " df_new.loc[1,'num_months']=df[df.iloc[:,0]=='Freight Charge'].Discount.iloc[0]\n", + " df_new.loc[1,'exp_date']='Freight Charge'\n", + " if 'Insurance Charge' in df.iloc[:,0].tolist():\n", + " df_new.loc[2,'num_months']=df[df.iloc[:,0]=='Insurance Charge'].Discount.iloc[0]\n", + " df_new.loc[2,'exp_date']='Insurance Charge'\n", + " if 'Packing Charge' in df.iloc[:,0].tolist():\n", + " df_new.loc[3,'num_months']=df[df.iloc[:,0]=='Packing Charge'].Discount.iloc[0]\n", + " df_new.loc[3,'exp_date']='Packing Charge'\n", + " if 'Total Excluding GST' in df.iloc[:,4].tolist():\n", + " df_new.loc[4,'num_months']=df[df.iloc[:,4]=='Total Excluding GST'].Discount.iloc[0]\n", + " df_new.loc[4,'exp_date']='Total Excluding GST'\n", + " if 'Other Charges' in df.iloc[:,4].tolist():\n", + " df_new.loc[5,'num_months']=df[df.iloc[:,4]=='Other Charges'].Discount.iloc[0]\n", + " df_new.loc[5,'exp_date']='Other Charges'\n", + " if 'Add GST @ 0%' in df.iloc[:,4].tolist():\n", + " df_new.loc[6,'num_months']=df[df.iloc[:,4]=='Add GST @ 0%'].Discount.iloc[0]\n", + " df_new.loc[6,'exp_date']='Add GST @ 0%'\n", + " if 'Add GST @ 7%' in df.iloc[:,4].tolist():\n", + " df_new.loc[6,'num_months']=df[df.iloc[:,4]=='Add GST @ 7%'].Discount.iloc[0]\n", + " df_new.loc[6,'exp_date']='Add GST @ 7%'\n", + " if 'Invoice Total Including GST' in df.iloc[:,4].tolist():\n", + " df_new.loc[7,'num_months']=df[df.iloc[:,4]=='Invoice Total Including GST'].Discount.iloc[0]\n", + " df_new.loc[7,'exp_date']='Invoice Total Including GST'\n", + " \n", + " for idx, i in zip(df_new.index, df_new.num_months):\n", + " try:\n", + " df_new.loc[idx,'num_months']=float(i)\n", + " except:\n", + " continue\n", + " \n", + " return df_new\n", + "\n", + "\n", + "def regsplit(m):\n", + " trim = lambda x: ' '.join(x.split())\n", + " g = re.search('.*\\-\\d{3,4}[A-Z]',m) or re.search('^\\d{6}',m) or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}',m)\\\n", + " or re.search('^[A-Z]\\d{8}',m) or re.search('.*\\-\\d{3,4}',m)\n", + " g = g.group()\n", + " g2 = m.replace(g,'')\n", + " return trim(g),trim(g2)\n", + "\n", + "# returns only\n", + "def regsplit2(m):\n", + " trim = lambda x: ' '.join(x.split())\n", + " g = re.search('.*\\-\\d{3,4}[A-Z]',m) or re.search('^.*\\d{6}',m) or re.search('^.*[A-Z]{2}\\d{2}\\-[A-Z]{2}',m)\\\n", + " or re.search('^.*[A-Z]\\d{8}',m) or re.search('.*\\-\\d{3,4}',m)\n", + " g = g.group()\n", + " return trim(g)\n", + "\n", + "def check_space(string):\n", + " \n", + " # counter\n", + " count = 0\n", + " \n", + " # loop for search each index\n", + " for i in range(0, len(string)):\n", + " \n", + " # Check each char\n", + " # is blank or not\n", + " if string[i] == \" \":\n", + " count += 1\n", + " \n", + " return count\n", + "\n", + "# function to return shipping instructions\n", + "def ship(df_):\n", + " si=df_[0]\n", + " si=si.dropna(subset=['Unnamed: 2'])\n", + " if 'Shipping Instructions:' in si['Unnamed: 2'].tolist():\n", + " idx = si[si['Unnamed: 2']=='Shipping Instructions:'].index[0]\n", + " si_list = si.loc[idx+1:,'Unnamed: 2']\n", + " si_list1 = si_list.tolist()\n", + " si_list2 = ','.join(si_list1)\n", + " else:\n", + " si_list2 = 'No Shipping Instructions Available'\n", + " return si_list2\n", + "\n", + "\n", + "def NPpcodesPI(i): \n", + " return pi_np[pi_np['material_no']==i]\n", + "\n", + 
"def NPpcodesOA(i):\n", + " return oa_np[oa_np['material_no']==i]\n", + "\n", + "def PpcodesPI(i): \n", + " return pi_p[pi_p['material_no']==i]\n", + "\n", + "def PpcodesOA(i):\n", + " return oa_p[oa_p['material_no']==i]\n", + "\n", + "def expiry():\n", + " a = temp_under16\n", + " poNum = a.po_numT.unique()\n", + " numPdts = a.qty.sum()\n", + " numPCodes = len(a.material_no.unique())\n", + " return poNum, numPdts, numPCodes\n", + "\n", + "def numPo():\n", + " a = all_pi_df\n", + " pos = a['customer_purchase_order_#_T'].unique()\n", + " numPos = len(a['customer_purchase_order_#_T'].unique())\n", + " return pos, numPos\n", + "\n", + "# extract 2 columns \n", + "def trimm(i):\n", + " return i[['material_no','po_numT']]\n", + "\n", + "# convert series to list\n", + "def listit(seri):\n", + " pc = []\n", + " for w in seri:\n", + " pc.append(w)\n", + " return pc\n", + "\n", + "\n", + "# # final status output for reconciliation\n", + "# def NPstatus():\n", + "# a = NP_status[NP_status.status=='Discrepancies in OA']\n", + "# mat = a.material_no\n", + "# po = a.po_numT.unique()\n", + "# req = (NP_status.status=='Missing code in OA') | (NP_status.status=='Missing code in PI')\n", + "# b = NP_status[req]\n", + "# pc = b.material_no\n", + "# pom = b.po_numT.unique()\n", + "# return listit(po),listit(mat), listit(pc),listit(pom)\n", + "\n", + "\n", + "# final status output for reconciliation\n", + "def NPstatus():\n", + " a = NP_status[NP_status.status=='Discrepancies in OA']\n", + " mat = a.material_no.unique()\n", + " po = a.po_numT.unique()\n", + " req = (NP_status.status=='Missing code in OA') | (NP_status.status=='Missing code in PI')\n", + " b = NP_status[req]\n", + " pc = b.material_no.unique()\n", + " pom = b.po_numT.unique()\n", + " return listit(po),listit(mat), listit(pc),listit(pom)\n", + "\n", + "\n", + "def Pstatus():\n", + " a = P_status[P_status.status=='Discrepancies in OA'] \n", + " mat = a.material_no.unique()\n", + " po = a.po_numT.unique()\n", + " req = (P_status.status=='Missing code in OA') | (P_status.status=='Missing code in PI')\n", + " b = P_status[req]\n", + " pc = b.material_no.unique()\n", + " pom = b.po_numT.unique()\n", + " return listit(po),listit(mat), listit(pc),listit(pom)\n", + "\n", + "def overallPO():\n", + " a = all_pi_df\n", + " pc = len(all_pi_df['customer_purchase_order_#'].unique())\n", + " return pc\n", + "\n", + "missingtext = lambda x: 'Missing code in PI' if x=='Discrepancies in OA' else 'Missing code in OA' if x=='Correct items in PI' else \\\n", + "'NA'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "562244d3", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "def execute(df_):\n", + " \n", + " # extract customer po & shipping instructions\n", + "\n", + " po_num = extractPo(df_)\n", + "\n", + " # create dataframe for packing charge etc\n", + " others = otherCharges(df_)\n", + " \n", + " \n", + " # create dataframes of all relevant rows\n", + "\n", + " # length of dataframe (raw)\n", + " leng = len(df_)\n", + "\n", + " # create new dataframe\n", + " temp = pd.DataFrame()\n", + "\n", + " # concat dataframes with 9 columns which have product codes\n", + "\n", + " for i in range(0, leng):\n", + " \n", + " if df_[i].shape[1]==9:\n", + " temp=pd.concat([temp,df_[i]])\n", + "\n", + " elif df_[i].shape[1]>9:\n", + " with open(path_output+f\"Alert_{po_num}_.txt\", \"w\") as f:\n", + " f.write(f'OA file with PO: {po_num} has to be fixed before running.')\n", + " print(f\"{po_num} has to be fixed before 
running.\")\n", + "\n", + " continue\n", + " else:\n", + " continue\n", + " \n", + "\n", + "\n", + " # reset index\n", + " temp = temp.reset_index(drop=True)\n", + " \n", + " req = (temp['Unnamed: 3']=='EA') | (temp['Unnamed: 3']=='BOX')\n", + " temp=temp[req]\n", + " \n", + " \n", + " # set as string\n", + " temp.iloc[:,0]=temp.iloc[:,0].astype(str)\n", + "\n", + " # find the last row with product code\n", + " tot=[]\n", + " for idx, m in zip(temp.index,temp.iloc[:,0]):\n", + " \n", + " if re.search('.*\\-\\d{3,4}\\D.*',m) or re.search('.*\\-\\d{3,4}\\s',m) or re.search('.*\\-\\d{3,4}$',m)\\\n", + " or re.search('^\\d{6}\\s',m) or re.search('^\\d{6}\\D.*',m) or re.search('^\\d{6}$',m)\\\n", + " or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\D*',m) or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\s',m)\\\n", + " or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}$',m)\\\n", + " or re.search('^[A-Z]\\d{8}\\D.*',m) or re.search('^[A-Z]\\d{8}\\s.*',m) or re.search('^[A-Z]\\d{8}$',m):\n", + " tot.append(idx)\n", + " else:\n", + " continue\n", + "\n", + " # replace 'nan' with null()\n", + " temp.replace('nan',np.nan, inplace=True)\n", + "\n", + "\n", + " # last row with valid product code\n", + " # ffill empty rows for columns 0 and 3\n", + "\n", + " temp.iloc[:max(tot)+2,0].fillna(method='ffill',inplace=True)\n", + " temp.iloc[:max(tot)+2,3].fillna(method='ffill',inplace=True)\n", + "\n", + " # convert to strings\n", + " temp.iloc[:,0]=temp.iloc[:,0].astype(str)\n", + " \n", + " # extract row index with correct product code\n", + " idx=[]\n", + " for ind,m in zip(temp.index,temp.iloc[:,0]):\n", + " \n", + " if re.search('.*\\-\\d{3,4}\\D.*',m) or re.search('.*\\-\\d{3,4}\\s',m) or re.search('.*\\-\\d{3,4}$',m)\\\n", + " or re.search('^\\d{6}\\s',m) or re.search('^\\d{6}\\D.*',m) or re.search('^\\d{6}$',m)\\\n", + " or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\D*',m) or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\s',m)\\\n", + " or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}$',m)\\\n", + " or re.search('^[A-Z]\\d{8}\\D.*',m) or re.search('^[A-Z]\\d{8}\\s.*',m) or re.search('^[A-Z]\\d{8}$',m):\n", + " idx.append(ind)\n", + " else:\n", + " continue\n", + " \n", + " # extract only rows with correct product code using index\n", + " temp= temp.loc[idx]\n", + " temp=temp.reset_index(drop=True)\n", + " \n", + " # copy to df\n", + " df=temp.copy()\n", + "\n", + " # create new dataframe to transfer all info over\n", + " new = pd.DataFrame(columns=['material_no','material_desc','batch','qty','unit_listprice','unit_discprice',\\\n", + " 'list_pricetotal','disc_pricetotal','po_num']) #-------------------------------------------\n", + "\n", + " # fill in new dataframe (df is the old dataframe)\n", + " matno=[]\n", + " matdes=[]\n", + " # if both mat'l no and mat'l description are in column 1 of old dataframe. 
This will separate them.\n", + " for i,m,d in zip(df.index,df.iloc[:,0],df.iloc[:,1]): # 1st column\n", + " if re.search('.*\\-\\d{3,4}\\D.*',m) or re.search('.*\\-\\d{3,4}\\s',m)\\\n", + " or re.search('^\\d{6}\\s',m) or re.search('^\\d{6}\\D.*',m)\\\n", + " or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\D*',m) or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}\\s',m)\\\n", + " or re.search('^[A-Z]\\d{8}\\D.*',m) or re.search('^[A-Z]\\d{8}\\s.*',m):\n", + " res1,res2 = regsplit(m)\n", + " \n", + " new.loc[i,'material_no'] =res1 # 1st column\n", + " new.loc[i,'material_desc'] = res2 # 2nd column\n", + " \n", + " # if both mat'l no and mat'l description are in separate columns of the old dataframe, then no need to separate\n", + " # just assign to the respective columns in the \"new\" dataframe\n", + " elif re.search('.*\\-\\d{3,4}$',m) or re.search('^\\d{6}$',m) or re.search('^[A-Z]{2}\\d{2}\\-[A-Z]{2}$',m)\\\n", + " or re.search('^[A-Z]\\d{8}$',m):\n", + " new.loc[i,'material_no'] = m # 1st column\n", + " new.loc[i,'material_desc'] = d # 2nd column\n", + "\n", + " # fill in batch number\n", + " new.batch = df.iloc[:,3] # 3rd column\n", + " \n", + "\n", + " df.iloc[:,5]=df.iloc[:,5].astype(str)\n", + " for idx,i in zip(df.index,df.iloc[:,5]):\n", + " space = check_space(i)\n", + "\n", + " if space==2:\n", + " unjoin1 = lambda m :str(m).split(' ')[0]\n", + " unjoin2 = lambda m :str(m).split(' ')[1]\n", + " unjoin3 = lambda m :str(m).split(' ')[2]\n", + " new.loc[idx,'qty'] = unjoin1(i)\n", + " new.loc[idx,'unit_listprice'] = unjoin2(i)\n", + " new.loc[idx,'unit_discprice'] = unjoin3(i)\n", + " # copy the rest of the values according to row indexes\n", + " new.loc[idx,'list_pricetotal'] = df.iloc[idx,7]# 5th column\n", + " new.loc[idx,'disc_pricetotal'] = df.iloc[idx,8]\n", + " new.loc[idx,'po_num'] = po_num\n", + "\n", + "\n", + "\n", + " elif space==1:\n", + " # remove error in qty where qty includes unit list price\n", + " unjoin1 = lambda m :str(m).split(' ')[0]\n", + " unjoin2 = lambda m :str(m).split(' ')[1]\n", + " new.loc[idx,'qty'] = unjoin1(i)\n", + " new.loc[idx,'unit_listprice'] = unjoin2(i)\n", + " # copy the rest of the values according to row indexes\n", + " new.loc[idx,'unit_discprice'] = df.iloc[idx,6]\n", + " new.loc[idx,'list_pricetotal'] = df.iloc[idx,7]# 5th column\n", + " new.loc[idx,'disc_pricetotal'] = df.iloc[idx,8]\n", + " new.loc[idx,'po_num'] = po_num\n", + "\n", + " else:\n", + " new.loc[idx,'qty'] = np.nan\n", + " \n", + " # drop row if qty is nan\n", + " new.dropna(subset=['qty'], inplace=True)\n", + " \n", + " # convert null to today's date to allow datatime calculation\n", + " new=new.replace(np.nan,'naan')\n", + " \n", + "\n", + " new['exp_date']=[toMthYr(i) for i in new.batch]\n", + " new['curr_date']=pd.to_datetime('today')\n", + " new['num_months'] = month_diff(new.exp_date, new.curr_date)\n", + " \n", + " \n", + " # convert 'exp_date' back to strings\n", + " new['exp_date'] = [i.strftime(\"%Y-%m\") for i in new['exp_date']]\n", + "\n", + " # reorganize columns of dataframe\n", + " new = new[['material_no','material_desc','batch','qty','unit_listprice','unit_discprice',\\\n", + " 'list_pricetotal','disc_pricetotal','po_num','exp_date','num_months']]\n", + " \n", + " \n", + " new['qty'] = [int(float(i)) for i in new['qty']]\n", + " new['list_pricetotal']=new['list_pricetotal'].str.replace(',',' ').str.replace(' ','')\n", + " new['list_pricetotal'] = [float(i) for i in new['list_pricetotal']]\n", + " new['disc_pricetotal']=new['disc_pricetotal'].str.replace(',',' 
').str.replace(' ','')\n", + " new['disc_pricetotal'] = [float(i) for i in new['disc_pricetotal']]\n", + "\n", + " new['unit_listprice'] = [float(i) for i in new['unit_listprice']]\n", + " new['unit_discprice'] = [float(i) for i in new['unit_discprice']]\n", + " new.num_months = [int(i) for i in new.num_months]\n", + "\n", + " # combine new dataframe and others dataframe and assign to OA dataframe\n", + " new = new.sort_values('material_no')\n", + " new = new.reset_index(drop=True)\n", + " \n", + " # if batch number is \"R\" then num_months column should be \"no match.....\"\n", + " for i,b in zip(new.index, new.batch):\n", + " if re.search('\\-R\\d{4}$',b):\n", + " new.loc[i,'num_months']=0\n", + " new['po_numT'] = [i[:4] for i in new.po_num]\n", + "\n", + " return new, others, po_num" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4f6d09c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e82e64d8", + "metadata": {}, + "outputs": [], + "source": [ + "# OA: groupby based on po numbers v = po_num, temp = consolidated OA\n", + "def execute2(temp):\n", + " \n", + " \n", + " # slicing those products with at least 16 months shelve life\n", + " temp_over16 = temp[temp.num_months >= 16]\n", + " temp_under16 = temp[temp.num_months < 16]\n", + " \n", + "\n", + " # reorganizating columns\n", + " temp = temp[['material_no','material_desc','qty','unit_listprice','unit_discprice','list_pricetotal','disc_pricetotal',\\\n", + " 'po_num','po_numT','batch','exp_date','num_months']]\n", + " \n", + " # groupby product code and run based on each PO Number (file has multiple PO Number)\n", + " tot_gby=pd.DataFrame()\n", + " \n", + " for i in temp.po_numT.unique():\n", + " temp1 = temp[temp.po_numT==i]\n", + " temp_gby = temp1.groupby(['material_no','material_desc','unit_listprice','unit_discprice','po_num','po_numT',\n", + " ]).agg(qty=('qty','sum'),list_pricetotal=('list_pricetotal','sum'), \n", + " disc_pricetotal=('disc_pricetotal','sum')).reset_index()\n", + " tot_gby=pd.concat([tot_gby,temp_gby])\n", + " \n", + " \n", + " \n", + " non_promo_gby=pd.DataFrame(columns=tot_gby.columns)\n", + " promo_gby = pd.DataFrame(columns=tot_gby.columns)\n", + " \n", + " tot_gby = tot_gby.reset_index(drop=True)\n", + " _=tot_gby.copy()\n", + " # create dataframe for non promo and promo products on loop (based on conditions)\n", + " for c, l, d in zip(_.index,_.unit_listprice, _.unit_discprice):\n", + "\n", + " if d==0 and l==0 or d
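
A minimal end-to-end driver sketch, not code from the uploaded notebook, showing how the pieces above are meant to fit together: read_pdf pulls every table out of one order-acknowledgement PDF, execute() turns that list of tables into the cleaned line-item frame plus the charges/shipping frame, and the per-file results are concatenated before the per-PO grouping in execute2(). The folder paths, the file loop, the output CSV name, and the assumption that execute2() returns the grouped result are all illustrative stand-ins rather than values taken from the notebook.

import os
import pandas as pd
from tabula.io import read_pdf

# assumed stand-ins for the notebook's `path` / `path_output` OneDrive folders
path = 'C:/data_input/'
path_output = 'C:/data_output/'

all_oa = pd.DataFrame()      # consolidated OA line items across PDFs
all_others = pd.DataFrame()  # freight / GST / shipping-instruction rows per PDF

for fname in os.listdir(path):                    # hypothetical: one OA PDF per file
    if not fname.lower().endswith('.pdf'):
        continue
    tables = read_pdf(path + fname, pages='all')  # list of DataFrames, the shape execute() expects
    new, others, po_num = execute(tables)         # execute() is defined in the cells above
    all_oa = pd.concat([all_oa, new], ignore_index=True)
    all_others = pd.concat([all_others, others], ignore_index=True)

grouped = execute2(all_oa)                        # per-PO grouping of the consolidated OA rows
all_oa.to_csv(path_output + 'oa_consolidated.csv', index=False)  # illustrative output file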