From e981792affbb27cf494da023343ebb3531c1e6d8 Mon Sep 17 00:00:00 2001
From: harpunius <tobias_sommer@hotmail.com>
Date: Sat, 14 Oct 2017 22:00:42 +0200
Subject: [PATCH] Multiple authors + various  (#7)

* Changes Email regex to use multiline command as otherwise the Start of String (^) and End of String ($) fail to work in a text-block. Previously reported a stopspace-error on mail@krourke.org in the test.txt file.

* Strips fields when printing ApaMatch objects.

* Utilises and simplifies common YEAR pattern in other patterns
Implements ampintextref and fixes andinbracketref

* Removes (commented-out) refdatedot pattern
* Changes for loops to for-each loops
* Outputs appropriate feedback for ampintextref.
---
 pyapa/pyapa.py | 83 ++++++++++++++++++++++++++++----------------------
 1 file changed, 46 insertions(+), 37 deletions(-)

diff --git a/pyapa/pyapa.py b/pyapa/pyapa.py
index 2913136..47e36d4 100644
--- a/pyapa/pyapa.py
+++ b/pyapa/pyapa.py
@@ -36,14 +36,14 @@ def print(self):
             print("Match from " + str(self.start) + " to " + str(self.end)
                   + " for:")
             if self.target:
-                print("Target: " + self.target)
+                print("Target: " + self.target.strip())
         if self.feedback:
-            print("Feedback: " + self.feedback)
+            print("Feedback: " + self.feedback.strip())
         if self.see:
             print("See: " + self.see)
         if self.suggestions:
             for s in self.suggestions:
-                print("Suggestion: " + s)
+                print("Suggestion: " + s.strip())
 
     def sprint(self):
         string = ''
@@ -67,16 +67,18 @@ def sprint(self):
 # defines patterns for common errors and builds an array of ApaMatch objects
 class ApaCheck:
     # pattern for year, YYYY or n.d.
-    YEAR = r"(\d\d\d\d|n\.d\.)"
+    YEAR = r"(\b\d{4}|n\.d\.)"
 
     # set up regex matching for common mistakes with one word of context
 
     # letters that appear immediately after a year should be lowercase
-    yearletter = re.compile(r"(\b\d\d\d\d[A-Z][),])")
+    yearletter = re.compile(r"(\b\d{4}[A-Z][),])")
 
     # do not put a comma before 'et al.'
     etalcomma = re.compile(r"((\w+), et al\..{0,5})")
 
+    # TODO: Incorrect to assume that any reference should be followed by a period
+    '''
     # put a period after the date in a reference
     refdatedot = re.compile(r"""((\w+\s+)                  # context
                                 (                          # start group
@@ -86,40 +88,42 @@ class ApaCheck:
                                 [^.]                       # error
                                 ))                         # close group
                             """, re.X)
+    '''
 
     # if this is an article, consider using lowercase
     # detects titlecase within a reference
     reftitlecase = re.compile(r"""(\)\.\s*(?:[A-Z][^\s]*\s?)+(\.))""")
 
     # every period should be followed by a space
-    # remove URLs from file
+    # unmatched by URL patterns
     stopspace = re.compile(r"""((\w+)                 # context
                                (?!\b).\.[^ ,\n0-9)]   # match
                                (\w+))                 # context
                            """, re.X)
+    
     # multiple references should be combined with a semicolon
     joinrefstyle = re.compile(
-        r"\([^)]+(\b\d\d\d\d|n\.d\.)\)([\s+,;]*\([^)]+(\b\d\d\d\d|n\.d\.)\))+"
+        r"\([^)]+"+YEAR+"\)([\s+,;]*\([^)]+"+YEAR+"\))+"
     )
 
     # place the period after an in-text citation
-    refbeforedot = re.compile(r"(\.\s+\([^)]+(\b\d\d\d\d|n\.d\.)\))")
-
-    #if only the year is in brackets for an in-text citation, use "and" to
-    #separate author names
-    #intextciteand   = re.compile(r"""(.{0,5}           # context
-    #                                .{0,5})            # context
-    #                             """, re.X)
+    refbeforedot = re.compile(r"(\.\s+\([^)]+"+YEAR+"\))")
+   
+    # if only the year is in brackets for an in-text citation, use "and" to
+    # separate author names
+    # Hans & Yorke (2006) contradict ...
+    ampintextref = re.compile(r"[\w\s]+\s&\s[\w\s]+\("+YEAR+"\)")
 
     # if author names are in brackets for an in-text citation, use an & to
     # separate them
-    textciteamp = re.compile(r"(\([^)]+\sand\s[^)]+\s(\b\d\d\d\d|n\.d\.)\))")
-
+    # ... literature (Hans and Yorke, 2006).
+    andinbracketref = re.compile(r"\([^)]+\sand\s[^)]+\s"+YEAR+"\)")
+    
     # patterns to be exempted
     url = re.compile(r"(http|ftp|file|https)://([\w_-]+(?:(?:\.[\w_-]+)+))"
                      + r"([\w.\b]*[\w\b])")
-    # Needs multiline in order to incorporate SoS (^) and EoS ($) in a text block
     email = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", re.MULTILINE)
+
     
     # init ApaCheck object with an optional context length
     def __init__(self):
@@ -131,75 +135,80 @@ def match(self, text):
         # find matches for each regexp
         matchList = list(re.finditer(self.yearletter, text))
         matchList += list(re.finditer(self.etalcomma, text))
-        matchList += list(re.finditer(self.refdatedot, text))
+        #matchList += list(re.finditer(self.refdatedot, text))
         matchList += list(re.finditer(self.reftitlecase, text))
         matchList += list(re.finditer(self.stopspace, text))
         matchList += list(re.finditer(self.joinrefstyle, text))
         matchList += list(re.finditer(self.refbeforedot, text))
+        matchList += list(re.finditer(self.ampintextref, text))
+        matchList += list(re.finditer(self.andinbracketref, text))
 
         unmatchList = list(re.finditer(self.url, text))
         unmatchList += list(re.finditer(self.email, text))
 
         # append to Match array
-        for i in range(len(matchList)):
+        for match in matchList:
             newMatch = ApaMatch()
-            newMatch.start = matchList[i].start()
-            newMatch.end = matchList[i].end()
+            newMatch.start = match.start()
+            newMatch.end = match.end()
             newMatch.target = text[newMatch.start:newMatch.end]
 
             suggestion = ""
 
-            if matchList[i].re == self.yearletter:
+            if match.re == self.yearletter:
                 newMatch.feedback = (u"Letters that appear immediately after a year should be lowercase.")
                 suggestion = newMatch.target.lower()
 
-            elif matchList[i].re == self.etalcomma:
+            elif match.re == self.etalcomma:
                 newMatch.feedback = (u"Do not put a comma before 'et al.'")
                 newMatch.see = "http://academicguides.waldenu.edu/writingcenter/apa/citations/etal"
                 suggestion = re.sub(r", ", " ", newMatch.target)
 
-            elif matchList[i].re == self.refdatedot:
-                newMatch.feedback = (u"References go at the end of sentences; end your sentence after the reference.")
-                s = newMatch.target
-                suggestion = re.sub(r"\)\s", r").", newMatch.target)
+                '''
+                elif match.re == self.refdatedot:
+                    newMatch.feedback = (u"References go at the end of sentences; end your sentence after the reference.")
+                    s = newMatch.target
+                    suggestion = re.sub(r"\)\s", r").", newMatch.target)
+                '''
 
-            elif matchList[i].re == self.reftitlecase:
+            elif match.re == self.reftitlecase:
                 newMatch.feedback = (u"If this is an article title, consider using lowercase.")
                 s = newMatch.target.lower()
                 s = s[4:]
                 suggestion = newMatch.target[:4]
                 suggestion += s
 
-            elif matchList[i].re == self.stopspace:
+            elif match.re == self.stopspace:
                 newMatch.feedback = (u"Every period should be followed by a space.")
                 suggestion = re.sub(r"\.", r". ", newMatch.target)
 
                 # double check that this isn't a false positive
-                for i in range(len(unmatchList)):
-                    unMatch = unmatchList[i]
+                for unMatch in unmatchList:
                     falsePositive = text[unMatch.start():unMatch.end()]
                     if re.search(newMatch.target, falsePositive):
                         newMatch = ""
                         suggestion = ""
                         break
 
-            elif matchList[i].re == self.joinrefstyle:
+            elif match.re == self.joinrefstyle:
                 newMatch.feedback = (u"Multiple parentheticals should be combined using a semicolon.")
                 newMatch.see = "http://www.apastyle.org/learn/faqs/references-in-parentheses.aspx"
                 suggestion = re.sub(r"\)[\s+,;]*\(", r"; ", newMatch.target)
 
-            elif matchList[i].re == self.refbeforedot:
+            elif match.re == self.refbeforedot:
                 newMatch.feedback = (u"In text citations belong as part of the preceeding sentence. Place the period after the citation.")
                 s = newMatch.target[:2]
                 suggestion = newMatch.target[2:]
                 suggestion += s
 
-            elif matchList[i].re == self.textciteamp:
-                print("Match")
-                newMatch.feedback = (u"Use & to separate bracketed author names.")
+            elif match.re == self.andinbracketref:
+                newMatch.feedback = (u"Use '&' to separate parenthesized author names.")
+                suggestion = re.sub(r"&", r"and", newMatch.target)
                 newMatch.see = "http://blog.apastyle.org/apastyle/2011/02/changes-parentheses-bring.html"
-                suggestion = re.sub(r"and", r"&", newMatch.target)
 
+            elif match.re == self.ampintextref:
+                newMatch.feedback = (u"Use 'and' to separate in-text author names.")
+                suggestion = re.sub(r"and", r"&", newMatch.target)
 
             if newMatch and suggestion:
                 newMatch.suggestions.append(suggestion)