-
Notifications
You must be signed in to change notification settings - Fork 0
/
머신러닝 - 언어 감지 함수.py
61 lines (50 loc) · 1.63 KB
/
머신러닝 - 언어 감지 함수.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import glob
import os
import pandas as pd
import matplotlib.pyplot as plt
def normalize_list(original_list):
total_cnt = sum(original_list)
normalized_list = []
for i in original_list:
normalized_list.append(i/total_cnt)
return normalized_list
def frequency_alphabet(text):
text = text.lower() # 소문자 변환
# 알파벳 출현 빈도 구하기
code_a = ord("a")
code_z = ord("z")
cnt = [0 for n in range(0, 26)] # 26개의 0
for char in text:
code_current = ord(char)
if code_a <= code_current <= code_z:
cnt[code_current - code_a] += 1
return normalize_list(cnt)
def get_data_label(folder_name):
files = glob.glob("./머신러닝/language/{}/*.txt".format(folder_name)) # 폴더 내 텍스트 파일 추출
data = []
label = []
for fname in files:
# 레이블 구하기
basename = os.path.basename(fname)
lang = basename.split("-")[0]
# 텍스트 추출하기
with open(fname, "r", encoding="utf-8") as f:
text = f.read()
cnt = frequency_alphabet(text)
# 리스트에 넣기
label.append(lang)
data.append(cnt)
return data, label
def show_me_the_graph(data, label):
# 그래프 준비하기
graph_dict = {}
for i in range(0, len(data)):
y = label[i]
x = normalize_list(data[i])
if not (y in graph_dict):
graph_dict[y] = x
asclist = [[chr(n) for n in range(97, 97 + 26)]]
df = pd.DataFrame(graph_dict, index=asclist)
# 바그래프
df.plot(kind='bar', subplots=True, ylim=(0, 0.15))
plt.show()