add simon data

bigcode-project · Apr 13, 2024 · e3d2383 · e3d2383
1 parent 8ccc485
commit e3d2383
Show file tree

Hide file tree

Showing 161 changed files with 19,548 additions and 0 deletions.
diff --git a/data/raw/f_644_simon.py b/data/raw/f_644_simon.py
@@ -0,0 +1,119 @@
+import pandas as pd
+from sklearn.preprocessing import MinMaxScaler
+
+
+def f_644(list_of_pairs):
+    """
+    Create a Pandas DataFrame from a list of pairs and normalize the data using MinMaxScaler.
+    
+    Parameters:
+    list_of_pairs (list): A list of tuples, where the first element is the category and 
+                          the second element is the value.
+    
+    Returns:
+    DataFrame:  A pandas DataFrame containing the columns 'Category' and 'Value'.
+                Category contains the the first elements of each tuple.
+                Value contains the normalized values of each tuple.
+
+    Raises:
+        Exception: If the input array is empty.
+        ValueError: If Values are not numeric.
+    
+    Requirements:
+    - pandas
+    - sklearn.preprocessing.MinMaxScaler
+    
+    Example:
+    >>> list_of_pairs = [('Fruits', 5), ('Vegetables', 9), ('Dairy', -1), ('Bakery', -2), ('Meat', 4)]
+    >>> df = f_644(list_of_pairs)
+    >>> print(df)
+             Category     Value
+        0      Fruits  0.636364
+        1  Vegetables  1.000000
+        2       Dairy  0.090909
+        3      Bakery  0.000000
+        4        Meat  0.545455
+    >>> list_of_pairs = [('car', 3.2), ('bike', 0), ('train', -1), ('plane', -6.2), ('ship', 1234)]
+    >>> df = f_644(list_of_pairs)
+    >>> print(df)
+          Category     Value
+        0      car  0.007579
+        1     bike  0.004999
+        2    train  0.004193
+        3    plane  0.000000
+        4     ship  1.000000
+            """
+
+    if len(list_of_pairs) == 0:
+        raise Exception('The input array should not be empty.')
+
+    df = pd.DataFrame(list_of_pairs, columns=['Category', 'Value'])
+
+    if pd.api.types.is_numeric_dtype(df.Value) is not True:
+        raise ValueError('The values have to be numeric.')
+
+    scaler = MinMaxScaler()
+    df['Value'] = scaler.fit_transform(df[['Value']])
+
+    return df
+
+
+import unittest
+
+class TestCases(unittest.TestCase):
+    def test_case_1(self):
+        '''test with normal input data'''
+        input_data = [('traditional', -4), ('we', 7), ('because', 3), ('ability', 10), ('exactly', -7)]
+        result = f_644(input_data)
+        self.assertTrue(isinstance(result, pd.DataFrame))
+        self.assertTrue('Value' in result.columns)
+        self.assertAlmostEqual(result[result['Category'] == 'traditional']['Value'].item(), 0.176471, places=6)
+        self.assertAlmostEqual(result[result['Category'] == 'we']['Value'].item(), 0.823529, places=6)
+        self.assertAlmostEqual(result[result['Category'] == 'because']['Value'].item(), 0.588235, places=6)
+        self.assertAlmostEqual(result[result['Category'] == 'ability']['Value'].item(), 1.000000, places=6)
+        self.assertAlmostEqual(result[result['Category'] == 'exactly']['Value'].item(), 0.000000, places=6)
+
+
+    def test_case_2(self):
+        '''test empty input'''
+        input_data = []
+        self.assertRaises(Exception, f_644, input_data)
+
+    def test_case_3(self):
+        '''non numeric values'''
+        input_data = [('fast', 'test'), ('ago', -8), ('player', 7), ('standard', 2), ('specific', 0)]
+        self.assertRaises(Exception, f_644, input_data)
+
+
+    def test_case_4(self):
+        '''Floating point values'''
+        input_data = [('real', 4.453), ('others', -1.12), ('professor', -2.2), ('other', -5), ('task', -7.933)]
+        result = f_644(input_data)
+        self.assertTrue(isinstance(result, pd.DataFrame))
+        self.assertTrue('Value' in result.columns)
+        self.assertAlmostEqual(result[result['Category'] == 'real']['Value'].item(), 1.000000, places=6)
+        self.assertAlmostEqual(result[result['Category'] == 'others']['Value'].item(), 0.550057, places=6)
+        self.assertAlmostEqual(result[result['Category'] == 'professor']['Value'].item(), 0.462861, places=6)
+        self.assertAlmostEqual(result[result['Category'] == 'other']['Value'].item(), 0.236800, places=6)
+        self.assertAlmostEqual(result[result['Category'] == 'task']['Value'].item(), 0.000000, places=6)
+
+
+    def test_case_5(self):
+        '''test for basic output structure'''
+        input_data = [('visit', 4), ('brother', -2), ('experience', -10), ('whether', 8), ('hand', 3)]
+        result = f_644(input_data)
+        self.assertTrue(isinstance(result, pd.DataFrame))
+        self.assertTrue('Value' in result.columns)
+        self.assertTrue('Category' in result.columns)
+        self.assertTrue(0 <= result['Value'].min() <= 1)
+        self.assertTrue(0 <= result['Value'].max() <= 1)
+
+def run_tests():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestCases))
+    runner = unittest.TextTestRunner()
+    runner.run(suite)
+
+if __name__ == "__main__":
+
+    run_tests()
diff --git a/data/raw/f_645_simon.py b/data/raw/f_645_simon.py
@@ -0,0 +1,137 @@
+import nltk
+from string import punctuation
+import pandas as pd
+
+
+def f_645(text):
+    """
+    Finds all words in a text, that are seperated by whitespace, 
+    beginning with the "$" character and computes their number of occurences.
+
+    Parameters:
+    text (str): The input text.
+
+    Returns:
+    DataFrame: A pandas DataFrame with two columns: "Word" and "Frequency". 
+               "Word" contains the '$' prefixed words, and "Frequency" contains their occurrences.
+
+               
+    Raises:
+    ValueError: if text is not a string
+    
+    Requirements:
+    - nltk
+    - string
+    - pandas
+
+    The function ignores words that are entirely made up of punctuation, even if they start with a '$'.
+
+    Example:
+    >>> text = "$abc def $efg $hij klm $ $abc $abc $hij $hij"
+    >>> f_645(text)
+        Word  Frequency
+    0   $abc      3
+    1   $efg      1
+    2   $hij      3
+
+    >>> text = "$hello this i$s a $test $test $test"
+    >>> f_645(text)
+        Word	Frequency
+    0	$hello	1
+    1	$test	3
+    """
+    if not isinstance(text, str):
+        raise ValueError("The input should be a string.")
+
+    tk = nltk.WhitespaceTokenizer()
+    words = tk.tokenize(text)    
+    dollar_words = [word for word in words if word.startswith('$') and not all(c in set(punctuation) for c in word)]
+    freq = nltk.FreqDist(dollar_words)
+    df = pd.DataFrame(list(freq.items()), columns=["Word", "Frequency"])
+    return df
+
+import unittest
+
+
+class TestCases(unittest.TestCase):
+
+    def test_case_1(self):
+        text = "$abc def $efg $hij klm $ $abc $abc $hij $hij"
+        result = f_645(text)
+        expected_words = ["$abc", "$efg", "$hij"]
+        expected_freqs = [3, 1, 3]
+        self.assertListEqual(result["Word"].tolist(), expected_words)
+        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)
+
+    def test_case_2(self):
+        text = "This is a test without dollar words."
+        result = f_645(text)
+        self.assertEqual(len(result), 0)
+
+    def test_case_3(self):
+        text = "$test1 $test2 $test1 $test3"
+        result = f_645(text)
+        expected_words = ["$test1", "$test2", "$test3"]
+        expected_freqs = [2, 1, 1]
+        self.assertListEqual(result["Word"].tolist(), expected_words)
+        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)
+
+    def test_case_4(self):
+        text = "$! $$ $a $a $a"
+        result = f_645(text)
+        expected_words = ["$a"]
+        expected_freqs = [3]
+        self.assertListEqual(result["Word"].tolist(), expected_words)
+        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)
+
+    def test_case_5(self):
+        text = "$word1 word2 $word2 $word1 $word3 $word1"
+        result = f_645(text)
+        expected_words = ["$word1", "$word2", "$word3"]
+        expected_freqs = [3, 1, 1]
+        self.assertListEqual(result["Word"].tolist(), expected_words)
+        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)
+
+    def test_case_6(self):
+        '''empty input string'''
+        text = ""
+        result = f_645(text)
+        expected_words = []
+        expected_freqs = []
+        self.assertListEqual(result["Word"].tolist(), expected_words)
+        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)
+
+    def test_case_7(self):
+        '''check for correct return type'''
+        text = "$test 123 abcd.aef"
+        result = f_645(text)
+        self.assertTrue(isinstance(result, pd.DataFrame))
+        self.assertTrue('Word' in result.columns)
+        self.assertTrue('Frequency' in result.columns)
+
+    def test_case_8(self):
+        '''word with $ in the middle'''
+        text = "asdfj;alskdfj;$kjhkjhdf"
+        result = f_645(text)
+        expected_words = []
+        expected_freqs = []
+        self.assertListEqual(result["Word"].tolist(), expected_words)
+        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)
+
+    def test_case_9(self):
+        '''non string input'''
+        input = 24
+        self.assertRaises(Exception, f_645, input)
+
+
+
+
+def run_tests():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestCases))
+    runner = unittest.TextTestRunner()
+    runner.run(suite)
+
+
+if __name__ == "__main__":
+    run_tests()