Skip to content

Commit

Permalink
added extract social regex and social methods
Browse files Browse the repository at this point in the history
  • Loading branch information
talaatmagdyx committed Jul 2, 2023
1 parent e8f1f9c commit ffd81fd
Show file tree
Hide file tree
Showing 6 changed files with 247 additions and 1 deletion.
2 changes: 2 additions & 0 deletions lib/socials_regex.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@
# Namespace of the gem. Every constant below is registered with
# Module#autoload, so its file is required the first time the constant is
# referenced rather than when this file is loaded.
module SocialsRegex
# Platforms and Regexes are both registered against 'socials_regex/platforms'.
autoload :Platforms, 'socials_regex/platforms'
autoload :Regexes, 'socials_regex/platforms'
# Socials and Extraction are both registered against 'socials_regex/socials'.
autoload :Socials, 'socials_regex/socials'
autoload :Extraction, 'socials_regex/socials'
end
4 changes: 3 additions & 1 deletion lib/socials_regex/platforms.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# frozen_string_literal: true

module SocialsRegex
# all supported platforms
class Platforms
PLATFORM_FACEBOOK = 'facebook'
PLATFORM_GITHUB = 'github'
Expand Down Expand Up @@ -39,6 +40,7 @@ def self.show(const_name:)
end
end

# all regexes for all platforms
class Regexes
ANGELLIST_URL_REGEX = {
# https://angel.co/company/twitter, https://angel.co/company/twitter/culture
Expand Down Expand Up @@ -225,7 +227,7 @@ class Regexes

YELP_URL_REGEX = {
# https://www.yelp.com/biz/example-business
company: %r{(?:https?://)?(?:www\.)?yelp\.com/biz/([A-Za-z0-9_-]+)}
company: %r{(?:https?://)?(?:www\.)?yelp\.com/biz/(?<company>[A-Za-z0-9_-]+)}
}.freeze

def self.match?(input_str:, regex:)
Expand Down
94 changes: 94 additions & 0 deletions lib/socials_regex/socials.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,99 @@

module SocialsRegex
# Lookup table tying every supported platform name to its set of regexes.
class Socials
  # Platform name (Symbol) => that platform's hash of regexes.
  # The pairs are written with the platform's String name and converted to
  # Symbol keys in one pass; insertion order is kept, and it is the order in
  # which the hash is iterated.
  PLATFORMS_REGEX = {
    SocialsRegex::Platforms::PLATFORM_YELP => SocialsRegex::Regexes::YELP_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_WHATSAPP => SocialsRegex::Regexes::WHATSAPP_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_STACKEXCHANGE_NETWORK =>
      SocialsRegex::Regexes::STACKEXCHANGE_NETWORK_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_CRUNCHBASE => SocialsRegex::Regexes::CRUNCHBASE_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_ANGELLIST => SocialsRegex::Regexes::ANGELLIST_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_XING => SocialsRegex::Regexes::XING_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_VIMEO => SocialsRegex::Regexes::VIMEO_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_TELEGRAM => SocialsRegex::Regexes::TELEGRAM_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_STACKOVERFLOW => SocialsRegex::Regexes::STACKOVERFLOW_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_STACKEXCHANGE => SocialsRegex::Regexes::STACKEXCHANGE_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_SNAPCHAT => SocialsRegex::Regexes::SNAPCHAT_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_SKYPE => SocialsRegex::Regexes::SKYPE_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_REDDIT => SocialsRegex::Regexes::REDDIT_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_PHONE => SocialsRegex::Regexes::PHONE_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_MEDIUM => SocialsRegex::Regexes::MEDIUM_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_HACKER_NEWS => SocialsRegex::Regexes::HACKERNEWS_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_EMAIL => SocialsRegex::Regexes::EMAIL_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_YOUTUBE => SocialsRegex::Regexes::YOUTUBE_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_INSTAGRAM => SocialsRegex::Regexes::INSTAGRAM_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_TWITTER => SocialsRegex::Regexes::TWITTER_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_LINKEDIN => SocialsRegex::Regexes::LINKEDIN_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_GITHUB => SocialsRegex::Regexes::GITHUB_URL_REGEX,
    SocialsRegex::Platforms::PLATFORM_FACEBOOK => SocialsRegex::Regexes::FACEBOOK_URL_REGEX
  }.transform_keys(&:to_sym).freeze

  # Error text for a platform name that has no entry in PLATFORMS_REGEX;
  # it lists the known keys.
  ERROR_MSG_UNKNOWN_PLATFORM = "Unknown platform, expected one of #{PLATFORMS_REGEX.keys}"
end

# Pulls social-profile matches out of a piece of text.
class Extraction
  attr_accessor :text

  # :param text: the text scanned by every extract_* call.
  def initialize(text:)
    @text = text
  end

  # Scan the text with the regexes of every known platform.
  # :return: hash keyed by platform; each value is a hash keyed by regex name
  #          holding the list of matches. Platforms without a match are left out.
  def extract_matches_per_platform
    Socials::PLATFORMS_REGEX.each_with_object({}) do |(platform, regexes), collected|
      collected.merge!(platform_matches(regexes: regexes, platform: platform))
    end
  end

  # Scan the text with a single regex.
  # :param regex: pattern to search for.
  # :return: list of match hashes (see reformat_matches).
  def extract_matches_by_regex(regex:)
    matches(regex: regex)
  end

  # Scan the text with the regexes of one platform.
  # :param platform: platform name; it is converted with to_sym for the lookup,
  #                  while the result is keyed by the value exactly as passed.
  # :return: hash of platform => { regex name => list of matches }, or an empty
  #          hash when nothing matched.
  # Raises Socials::ERROR_MSG_UNKNOWN_PLATFORM when the platform is not known.
  def extract_matches_by_platform(platform:)
    regexes = Socials::PLATFORMS_REGEX[platform.to_sym]
    return platform_matches(regexes: regexes, platform: platform) if regexes

    raise Socials::ERROR_MSG_UNKNOWN_PLATFORM
  end

  private

  # Run every regex of one platform over the text.
  # :param regexes: hash of regex name => pattern.
  # :param platform: key under which the results are stored.
  # :return: { platform => { regex name => matches } } restricted to the
  #          regexes that matched; an empty hash when none did.
  def platform_matches(regexes:, platform:)
    by_kind = regexes.each_with_object({}) do |(kind, regex), found|
      hits = matches(regex: regex)
      found[kind] = hits unless hits.empty?
    end
    by_kind.empty? ? {} : { platform => by_kind }
  end

  # Collect the MatchData of every occurrence of regex in the text and
  # convert each one into a plain hash.
  def matches(regex:)
    occurrences = []
    text.scan(regex) { occurrences << Regexp.last_match }
    reformat_matches(matches: occurrences)
  end

  # Turn each MatchData into a hash: the whole matched text under :matched,
  # plus one entry per named capture under the capture's (String) name.
  def reformat_matches(matches:)
    matches.map { |match| { matched: match[0] }.merge(match.named_captures) }
  end
end
end
21 changes: 21 additions & 0 deletions sig/socials_regex/extraction.rbs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
module SocialsRegex
  # Signatures for SocialsRegex::Extraction, which scans a text for
  # social-profile matches.
  class Extraction
    # One reformatted match: the Symbol key :matched holds the full matched
    # text, and every named capture is stored under its String name. A named
    # group that did not take part in the match has a nil value.
    type match_hash = Hash[Symbol | String, String?]

    # Matches of one platform, keyed by the regex name within that platform.
    type platform_hits = Hash[Symbol, Array[match_hash]]

    attr_accessor text: string

    def initialize: (text: string) -> void

    # Result is keyed by platform (Symbol keys of Socials::PLATFORMS_REGEX).
    def extract_matches_per_platform: () -> Hash[Symbol, platform_hits]

    def extract_matches_by_regex: (regex: Regexp | string) -> Array[match_hash]

    # The platform is looked up with to_sym, so a Symbol works as well as a
    # String; the result is keyed by the value exactly as it was passed in.
    def extract_matches_by_platform: (platform: string | Symbol) -> Hash[String | Symbol, platform_hits]

    private

    def platform_matches: (regexes: Hash[Symbol, Regexp], platform: string | Symbol) -> Hash[String | Symbol, platform_hits]

    def matches: (regex: Regexp | string) -> Array[match_hash]

    def reformat_matches: (matches: Array[MatchData]) -> Array[match_hash]
  end
end
6 changes: 6 additions & 0 deletions sig/socials_regex/socials.rbs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Signatures for the constants of SocialsRegex::Socials.
module SocialsRegex
class Socials
# Error text for a platform name that has no entry in PLATFORMS_REGEX.
ERROR_MSG_UNKNOWN_PLATFORM: string
# Platform name => (regex name => pattern).
PLATFORMS_REGEX: Hash[Symbol, Hash[Symbol, Regexp]]
end
end
121 changes: 121 additions & 0 deletions spec/socials_regex/extraction_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# frozen_string_literal: true

# Specs for SocialsRegex::Extraction: construction, scanning with every
# platform, scanning with one regex and scanning with one platform.
RSpec.describe SocialsRegex::Extraction do
  describe '#initialize' do
    it 'uses that text' do
      text = 'https://twitter.com/karllorey/status/1259924082067374088'
      extraction = described_class.new(text: text)
      expect(extraction.text).to eq text
    end
  end

  describe '#extract_matches_per_platform' do
    context 'with one link' do
      text = 'https://twitter.com/karllorey/status/1259924082067374088'
      extraction = described_class.new(text: text)
      let(:matches) { extraction.extract_matches_per_platform }

      it '#validate platform' do
        expect(matches.keys).to eq [:twitter]
      end

      it '#validate nested named_captures for example status in twitter' do
        expect(matches[:twitter].key?(:status)).to be true
      end

      it '#validate nested named_captures for example user in twitter' do
        expect(matches[:twitter].key?(:user)).to be true
      end

      it '#validate matched text like input' do
        expect(matches[:twitter][:status][0][:matched]).to eq text
      end

      it '#validate size of matching' do
        expect(matches[:twitter].keys.length).to eq 2
      end
    end

    context 'with different link with same platform' do
      # Two adjacent literals joined by a trailing space. A backslash at the
      # end of a line inside a single-quoted literal does not continue the
      # string; it would embed a backslash and a newline in the text instead.
      text = 'https://twitter.com/karllorey/status/1259924082067374088 ' \
             'https://twitter.com/karllorey12/status/12599240820673740883'
      extraction = described_class.new(text: text)
      matches = extraction.extract_matches_per_platform

      it '#validate platform' do
        expect(matches.keys).to eq [:twitter]
      end

      it '#validate nested named_captures for example status in twitter' do
        expect(matches[:twitter].key?(:status)).to be true
      end

      it '#validate nested named_captures for example user in twitter' do
        expect(matches[:twitter].key?(:user)).to be true
      end

      it '#validate size of matching with same platform with different link' do
        expect(matches[:twitter].keys.length).to eq 2
      end

      it '#validate size of links with nested keys like #user in twitter with same platform with different link' do
        expect(matches[:twitter][:user].length).to eq 2
      end

      it '#validate size of links with nested keys like #status in twitter with same platform with different link' do
        expect(matches[:twitter][:status].length).to eq 2
      end
    end

    context 'with multiple link with different platform' do
      # NOTE(review): the adjacent literals below are joined without any
      # separator, so the last URL of one line runs straight into the first URL
      # of the next line - confirm this is intended before adding spaces, since
      # the expectations below were written against this exact text.
      text = 'https://twitter.com/karllorey/status/1259924082067374088' \
             'https://twitter.com/karllorey12/status/12599240820673740883' \
             'http://crunchbase.com/organization/acme-corp [email protected] mailto:[email protected]' \
             'https://facebook.com/peter.parker https://www.facebook.com/profile.php?id=100004123456789' \
             'https://github.com/talaatmagdyx https://github.com/talaatmagdyx/socials_regex' \
             'https://news.ycombinator.com/item?id=23290375 https://instagram.com/__disco__dude' \
             'https://www.linkedin.com/in/talaatmagdyx/ https://medium.com/does-exist/some-post-123abc'
      extraction = described_class.new(text: text)
      matches = extraction.extract_matches_per_platform

      # The expected order is the order in which the platforms are scanned,
      # not the order in which the links appear in the text.
      it '#validate platform with multiple link with different platform' do
        expect(matches.keys).to eq %i[crunchbase medium hackernews email instagram twitter
                                      linkedin github facebook]
      end

      it '#validate nested named_captures for example status in twitter with multiple link with different platform' do
        expect(matches[:twitter].key?(:status)).to be true
      end

      it '#validate nested named_captures for example user in twitter with multiple link with different platform' do
        expect(matches[:twitter].key?(:user)).to be true
      end

      it '#validate size of matching with multiple link with different platform' do
        expect(matches[:twitter].keys.length).to eq 2
      end

      it '#validate size of links with nested keys like #user in twitter with multiple link with different platform' do
        expect(matches[:twitter][:user].length).to eq 2
      end
    end
  end

  describe '#extract_matches_by_regex' do
    it 'extract data from string using regex' do
      text = 'https://twitter.com/karllorey/status/1259924082067374088'
      extraction = described_class.new(text: text)
      matches = extraction.extract_matches_by_regex(regex: SocialsRegex::Regexes::TWITTER_URL_REGEX[:status])
      expect(matches[0][:matched]).to eq text
    end
  end

  describe '#extract_matches_by_platform' do
    # The result is keyed by the platform value exactly as passed in, which
    # is the String constant here (not a Symbol).
    it 'extract data from string using platform' do
      text = 'https://twitter.com/karllorey/status/1259924082067374088'
      extraction = described_class.new(text: text)
      matches = extraction.extract_matches_by_platform(platform: SocialsRegex::Platforms::PLATFORM_TWITTER)
      expect(matches[SocialsRegex::Platforms::PLATFORM_TWITTER][:status][0][:matched]).to eq text
    end
  end
end

0 comments on commit ffd81fd

Please sign in to comment.