[이제와서 시작하는 Python 마스터하기 #6] 문자열 처리와 정규표현식

게시 2025/08/10

Python 마스터하기 시리즈

By YonYonWare

읽는 시간: 35분

[이제와서 시작하는 Python 마스터하기 #6] 문자열 처리와 정규표현식

📧 5분만에 만드는 이메일 자동 처리 시스템

  
import re

def email_processor():
    """Validate, classify, and mask a fixed sample list of e-mail addresses.

    For every sample address the function prints whether it is valid
    (with the local part masked) or invalid, then prints summary counts,
    a per-company distribution, and a bulk-mail template rendered for the
    first valid address.

    Returns:
        A ``(valid_emails, invalid_emails)`` tuple of lists, each in the
        order the addresses appear in the sample data.
    """

    # Sample e-mail data
    emails = [
        "kim.python@samsung.com",
        "lee_dev@naver.com",
        "park.data@kakao.co.kr",
        "invalid.email@",
        "choi@lg.com",
        "jung.ai@sk-telecom.com",
        "not_an_email",
        "kang@daum.net"
    ]

    print("📧 이메일 처리 시스템\n")

    # Compiled once and used with fullmatch(): unlike match() with a
    # trailing "$", fullmatch() does not accept "user@example.com\n".
    email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

    # First domain label -> display name (loop-invariant, so built once)
    company_map = {
        'samsung': '삼성',
        'naver': '네이버',
        'kakao': '카카오',
        'lg': 'LG',
        'sk-telecom': 'SK텔레콤',
        'daum': '다음'
    }

    valid_emails = []
    invalid_emails = []
    company_stats = {}

    for email in emails:
        if not email_pattern.fullmatch(email):
            invalid_emails.append(email)
            print(f"❌ 무효: {email}")
            continue

        valid_emails.append(email)

        # The pattern guarantees exactly one "@", so this unpack is safe.
        username, domain = email.split('@')

        # Classify by company; unknown labels are counted under the raw label.
        company = domain.split('.')[0]
        company_name = company_map.get(company, company)
        company_stats[company_name] = company_stats.get(company_name, 0) + 1

        # Mask the local part: keep the first two and the last character for
        # longer names, only the first character for names of 3 or fewer.
        if len(username) > 3:
            masked = username[:2] + '*' * (len(username) - 3) + username[-1]
        else:
            masked = username[0] + '*' * (len(username) - 1)

        print(f"✅ 유효: {masked}@{domain}")

    # Summary statistics
    print(f"\n📊 통계:")
    print(f"유효한 이메일: {len(valid_emails)}개")
    print(f"무효한 이메일: {len(invalid_emails)}개")

    print(f"\n🏢 회사별 분포:")
    for company, count in sorted(company_stats.items(), key=lambda x: x[1], reverse=True):
        print(f"  - {company}: {count}명")

    # Bulk-mail template; the {name}/{company}/{discount_code} fields are
    # filled by str.format() below.
    if valid_emails:
        template = """
📨 대량 메일 발송 템플릿

안녕하세요, {name}님!

{company}에서 근무하시는 {name}님께 특별 할인 혜택을 드립니다.
할인코드: {discount_code}

감사합니다.
"""
        print("\n📨 샘플 템플릿 생성:")
        sample_email = valid_emails[0]
        name = sample_email.split('@')[0].split('.')[0].upper()
        company = sample_email.split('@')[1].split('.')[0].upper()
        discount_code = f"{company[:3]}{name[:3]}2024".upper()

        print(template.format(
            name=name,
            company=company,
            discount_code=discount_code
        ))

    return valid_emails, invalid_emails

# 실행
# email_processor()

📝 문자열(String) 완벽 마스터

문자열은 프로그래밍에서 가장 많이 다루는 데이터 타입 중 하나입니다. Python은 강력한 문자열 처리 기능을 제공합니다.

문자열 생성과 기본 조작

  
# Ways to create strings
single = 'Hello Python'
double = "Hello Python"
triple_single = '''여러 줄
문자열
가능'''
triple_double = """또 다른
여러 줄
문자열"""

# Escape sequences
escaped = "Line 1\nLine 2\tTabbed"
path = "C:\\Users\\Python"  # escaped backslashes
raw_path = r"C:\Users\Python"  # raw string: backslashes are kept literally

# Concatenation and repetition
greeting = "Hello" + " " + "World"
repeated = "Python! " * 3  # "Python! Python! Python! "

# Indexing and slicing
text = "Python Programming"
print(text[0])      # 'P' (first character)
print(text[-1])     # 'g' (last character)
print(text[0:6])    # 'Python' (slice)
print(text[7:])     # 'Programming'
print(text[::-1])   # 'gnimmargorP nohtyP' (reversed)

# Strings are immutable
# text[0] = 'J'  # TypeError!

[!TIP] 초보자를 위한 팁: 문자열은 수정할 수 없어요! (Immutable)

Python의 문자열은 한 번 만들어지면 내용을 바꿀 수 없습니다. text[0] = 'J' 처럼 직접 수정하려고 하면 에러가 납니다. 대신 new_text = 'J' + text[1:] 처럼 새로운 문자열을 만들어서 변수에 다시 저장해야 합니다.

문자열 메서드 - 검색과 확인

  
text = "Python is awesome and Python is powerful"

# Search methods
print(text.find("Python"))     # 0 (index of the first occurrence)
print(text.rfind("Python"))    # 22 (index of the last occurrence)
print(text.find("Java"))       # -1 (find returns -1 when absent)
print(text.index("awesome"))   # 10 (index raises ValueError when absent)
print(text.count("Python"))    # 2 (number of occurrences)

# Prefix / suffix checks
print(text.startswith("Python"))  # True
print(text.endswith("powerful"))  # True
print(text.startswith(("Java", "Python")))  # True (a tuple checks several prefixes)

# Character-class checks
print("Python3".isalnum())    # True (letters or digits only)
print("Python".isalpha())     # True (letters only)
print("12345".isdigit())      # True (digits only)
print("python".islower())     # True (all cased characters are lowercase)
print("PYTHON".isupper())     # True (all cased characters are uppercase)
print("   ".isspace())        # True (whitespace only)
print("Python Is Great".istitle())  # True (title-cased)

문자열 메서드 - 변환과 수정

  
# Case conversion
text = "Python Programming"
print(text.upper())       # PYTHON PROGRAMMING
print(text.lower())       # python programming
print(text.capitalize())  # Python programming
print(text.title())       # Python Programming
print(text.swapcase())    # pYTHON pROGRAMMING

# Whitespace handling
text = "  Python  "
print(text.strip())       # "Python" (strips both ends)
print(text.lstrip())      # "Python  " (strips the left end)
print(text.rstrip())      # "  Python" (strips the right end)
print("***Python***".strip("*"))  # "Python"

# Replacement
text = "Hello World"
print(text.replace("World", "Python"))  # Hello Python
print(text.replace("l", "L", 2))       # HeLLo World (only the first 2 are replaced)

# Splitting and joining
text = "apple,banana,orange"
fruits = text.split(",")          # ['apple', 'banana', 'orange']
print("-".join(fruits))           # apple-banana-orange

# Multi-line strings
lines = """Line 1
Line 2
Line 3"""
print(lines.splitlines())         # ['Line 1', 'Line 2', 'Line 3']

# Alignment and padding
print("Python".center(20, "*"))   # *******Python*******
print("Python".ljust(20, "-"))    # Python--------------
print("Python".rjust(20, "-"))    # --------------Python
print("42".zfill(5))              # 00042

🎨 실무 예제: 영수증 자동 생성기

  
def generate_receipt():
    """Print a formatted sample receipt and return its receipt ID.

    The purchase and customer data are fixed sample values. The receipt
    shows a header, one line per item, the membership discount, points
    spent, the final total, and the points earned/remaining.

    Labels are plain literals (no backslash escapes or same-quote nesting
    inside f-string expressions), so the function also compiles on Python
    versions older than 3.12.

    Returns:
        A 12-character uppercase hexadecimal receipt ID derived from the
        current timestamp and the customer name (it differs on every call).
    """
    from datetime import datetime
    import hashlib

    # Purchase data
    items = [
        {"name": "Galaxy S24 Ultra", "price": 1598000, "qty": 1},
        {"name": "Galaxy Buds Pro", "price": 219000, "qty": 2},
        {"name": "Galaxy Watch 6", "price": 449000, "qty": 1},
        {"name": "USB-C 케이블", "price": 25000, "qty": 3}
    ]

    customer = {
        "name": "김파이썬",
        "phone": "010-1234-5678",
        "membership": "VIP",
        "points": 15000
    }

    # One timestamp for both the printed purchase time and the receipt ID.
    now = datetime.now()
    purchased_at = now.strftime('%Y-%m-%d %H:%M:%S')

    # Receipt header
    print("=" * 50)
    print("{:^50}".format("🏪 SAMSUNG STORE"))
    print("=" * 50)
    print(f"\n고객명: {customer['name']:>10}")
    print(f"연락처: {customer['phone']:>10}")
    print(f"회원등급: {customer['membership']:>8}")
    print(f"\n구매일시: {purchased_at}")
    print("-" * 50)

    # Item list
    print(f"{'상품명':<20} {'단가':>12} {'수량':>5} {'금액':>12}")
    print("-" * 50)

    subtotal = 0
    for item in items:
        total = item['price'] * item['qty']
        subtotal += total
        # Truncate long names so they fit the 20-column name field.
        name = item['name'][:17] + "..." if len(item['name']) > 17 else item['name']
        print(f"{name:<20} {item['price']:>12,} {item['qty']:>5} {total:>12,}")

    # Membership discount, kept as an integer percentage so neither the
    # amount nor the printed rate depends on float rounding.
    discount_percent = 15 if customer['membership'] == 'VIP' else 10
    discount = subtotal * discount_percent // 100

    # Points can cover at most what is left after the discount.
    points_used = min(customer['points'], subtotal - discount)
    final_total = subtotal - discount - points_used

    # Payment summary
    discount_label = f"{customer['membership']} 할인 ({discount_percent}%)"
    print("-" * 50)
    print(f"{'부분 합계':>38} {subtotal:>12,}")
    print(f"{discount_label:>38} {-discount:>12,}")
    if points_used > 0:
        print(f"{'포인트 사용':>38} {-points_used:>12,}")
    print("=" * 50)
    print(f"{'최종 결제금액':>38} {final_total:>12,}")

    # Earned points: 1% of the amount actually paid, rounded down.
    earned_points = final_total // 100
    print(f"\n🎁 적립 포인트: {earned_points:,}P")
    print(f"💳 잔여 포인트: {customer['points'] - points_used + earned_points:,}P")

    # Receipt ID ("barcode"): MD5 is used only as a short fingerprint here,
    # not for anything security-sensitive.
    receipt_id = hashlib.md5(f"{now}{customer['name']}".encode()).hexdigest()[:12].upper()
    id_line = f"RECEIPT ID: {receipt_id}"
    print(f"\n{'*' * 50}")
    print(f"{id_line:^50}")
    print(f"{'*' * 50}")

    return receipt_id

# 실행
# generate_receipt()

문자열 포맷팅

  
# 1. %-formatting (legacy style)
name = "김파이썬"
age = 25
print("이름: %s, 나이: %d" % (name, age))

# 2. str.format() method
print("이름: {}, 나이: {}".format(name, age))
print("이름: {1}, 나이: {0}".format(age, name))  # positional indices
print("이름: {n}, 나이: {a}".format(n=name, a=age))  # keyword names

# Format specifiers
pi = 3.14159
print("{:.2f}".format(pi))        # 3.14 (two decimal places)
print("{:10}".format("Python"))   # Python     (width 10, left-aligned)
print("{:>10}".format("Python"))  #     Python (right-aligned)
print("{:^10}".format("Python"))  #   Python   (centered)
print("{:*^10}".format("Python")) # **Python** (custom fill character)

# 3. f-strings (Python 3.6+) - recommended
name = "이영희"
score = 95.5
print(f"{name}님의 점수는 {score:.1f}점입니다")

# Expressions inside f-strings
x, y = 10, 20
print(f"{x} + {y} = {x + y}")
print(f"{x} > {y}: {x > y}")

# Alignment and number formatting in f-strings
number = 42
print(f"{number:05d}")      # 00042
print(f"{number:>5}")       #    42
print(f"{pi:.2%}")          # 314.16% (percentage)
print(f"{1000000:,}")       # 1,000,000 (thousands separator)

# Self-documenting f-string expressions for debugging (Python 3.8+)
variable = "디버깅"
print(f"{variable=}")       # variable='디버깅'

[!TIP] f-string을 적극 활용하세요!

Python 3.6부터 도입된 f-string은 가장 빠르고, 읽기 쉽고, 강력한 문자열 포맷팅 방식입니다. % 연산자나 .format() 메서드보다 f"{변수}" 방식을 우선적으로 사용하세요. 코드가 훨씬 깔끔해집니다!

🔍 정규표현식 (Regular Expression)

정규표현식은 문자열 패턴을 표현하는 강력한 도구입니다.

정규표현식 기초

  
import re

# Basic pattern matching
text = "The phone number is 010-1234-5678"

# search: find the first match anywhere in the string
pattern = r"\d{3}-\d{4}-\d{4}"
match = re.search(pattern, text)
if match:
    print(match.group())  # 010-1234-5678
    print(match.start())  # 20 (start index)
    print(match.end())    # 33 (end index, exclusive)

# findall: find every match.
# A separate variable is used so `text` still holds the phone sentence
# for the match() examples below.
email_text = "Email: user@example.com, admin@test.org"
emails = re.findall(r"\w+@\w+\.\w+", email_text)
print(emails)  # ['user@example.com', 'admin@test.org']

# match: only matches at the beginning of the string
print(re.match(r"The", text))    # <re.Match object ...> ("The" is at the start)
print(re.match(r"phone", text))  # None ("phone" is not at the start)

[!TIP] Raw String (r"…")의 비밀

정규표현식을 쓸 때는 항상 r"패턴" 처럼 앞에 r을 붙이세요. r은 “Raw String”의 약자로, 백슬래시(\)를 이스케이프 문자가 아닌 일반 문자로 취급하겠다는 뜻입니다. 이걸 안 쓰면 \\d 처럼 백슬래시를 두 번씩 써야 해서 코드가 지저분해집니다.

정규표현식 패턴

  
# Metacharacters
# . : any single character (except a newline, unless re.DOTALL is set)
# ^ : start of the string
# $ : end of the string
# * : zero or more repetitions
# + : one or more repetitions
# ? : zero or one
# {n} : exactly n
# {n,} : n or more
# {n,m} : between n and m (inclusive)
# [] : character class
# | : OR
# () : group

# Character-class shorthands
# NOTE: for str patterns in Python 3 these are Unicode-aware unless
# re.ASCII is given, so \w also matches Hangul and \d other Unicode digits.
# \d : a decimal digit ([0-9] in ASCII mode)
# \D : anything that is not a digit
# \w : a word character ([a-zA-Z0-9_] in ASCII mode)
# \W : anything that is not a word character
# \s : a whitespace character
# \S : anything that is not whitespace

# Examples: display name -> pattern
patterns = {
    "숫자": r"\d+",
    "영문자": r"[a-zA-Z]+",
    "한글": r"[가-힣]+",
    "이메일": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    "URL": r"https?://(?:www\.)?[a-zA-Z0-9./]+",
    "전화번호": r"0\d{1,2}-\d{3,4}-\d{4}",
    "주민번호": r"\d{6}-[1-4]\d{6}",
    "IP주소": r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
}

test_text = """
연락처: 010-1234-5678
이메일: test@example.com
웹사이트: https://www.example.com
"""

# Print only the patterns that match something in the sample text.
for name, pattern in patterns.items():
    matches = re.findall(pattern, test_text)
    if matches:
        print(f"{name}: {matches}")

정규표현식 고급 기능

  
# Groups and capturing
text = "홍길동(30세), 김철수(25세)"
pattern = r"(\w+)\((\d+)세\)"
matches = re.findall(pattern, text)
print(matches)  # [('홍길동', '30'), ('김철수', '25')]

# Named groups
pattern = r"(?P<name>\w+)\((?P<age>\d+)세\)"
for match in re.finditer(pattern, text):
    print(f"이름: {match.group('name')}, 나이: {match.group('age')}")

# Substitution
text = "오늘은 2024-03-15입니다"
# Reorder the date with backreferences
result = re.sub(r"(\d{4})-(\d{2})-(\d{2})", r"\3/\2/\1", text)
print(result)  # 오늘은 15/03/2024입니다

# Substitution with a callback
def replace_date(match):
    """Return the matched Y-M-D date rewritten in day/month/year order."""
    year, month, day = match.groups()
    return f"{day}일 {month}월 {year}년"

result = re.sub(r"(\d{4})-(\d{2})-(\d{2})", replace_date, text)
print(result)  # 오늘은 15일 03월 2024년입니다

# Flags
# re.IGNORECASE (re.I) : case-insensitive matching
# re.MULTILINE (re.M) : ^ and $ also match at line boundaries
# re.DOTALL (re.S) : . also matches a newline
# re.VERBOSE (re.X) : allow whitespace and comments in the pattern

text = "Python python PYTHON"
print(re.findall(r"python", text, re.I))  # ['Python', 'python', 'PYTHON']

# Compiling a pattern for reuse
pattern = re.compile(r"\d{3}-\d{4}-\d{4}")
phone_numbers = ["010-1234-5678", "02-123-4567", "invalid"]
# Only "010-1234-5678" has the 3-4-4 digit shape this pattern requires.
for number in phone_numbers:
    if pattern.match(number):
        print(f"유효한 전화번호: {number}")

[!WARNING] 정규표현식은 만능이 아닙니다!

정규표현식은 강력하지만, 너무 복잡하게 짜면 “작성한 사람도 못 알아보는” 암호가 되어버립니다. 패턴이 너무 복잡해진다면, 차라리 Python 코드로 문자열을 자르고(split) 검사하는 것이 더 나을 수도 있습니다. 가독성이 항상 최우선입니다!

실용적인 정규표현식 패턴

  
class TextValidator:
    """Collection of stateless text validation and extraction helpers.

    Every method is a ``staticmethod``; the class only serves as a
    namespace, so methods can be called on the class or on an instance.
    """

    @staticmethod
    def is_email(text):
        """Return True if ``text`` is, in its entirety, a plausible e-mail address.

        ``re.fullmatch`` is used instead of ``match`` with ``^...$`` because
        ``$`` also matches just before a trailing newline, which would let
        ``"user@example.com\\n"`` pass.
        """
        pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
        return re.fullmatch(pattern, text) is not None

    @staticmethod
    def is_korean_phone(text):
        """Return True if ``text`` looks like a Korean phone number.

        Accepted shape: ``0`` plus 1-2 digits, ``-``, 3-4 digits, ``-``,
        4 digits (e.g. ``010-1234-5678`` or ``02-123-4567``).
        """
        pattern = r"0\d{1,2}-\d{3,4}-\d{4}"
        return re.fullmatch(pattern, text) is not None

    @staticmethod
    def is_password_strong(password):
        """Return True if ``password`` meets the strength rules.

        Rules: at least 8 characters and at least one each of a lowercase
        ASCII letter, an uppercase ASCII letter, a digit, and one of the
        special characters ``!@#$%^&*``.
        """
        if len(password) < 8:
            return False

        checks = [
            r"[a-z]",      # lowercase letter
            r"[A-Z]",      # uppercase letter
            r"\d",         # digit
            r"[!@#$%^&*]"  # special character
        ]

        return all(re.search(check, password) for check in checks)

    @staticmethod
    def extract_hashtags(text):
        """Return every ``#tag`` in ``text``, in order of appearance."""
        return re.findall(r"#\w+", text)

    @staticmethod
    def extract_mentions(text):
        """Return every ``@mention`` in ``text``, in order of appearance."""
        return re.findall(r"@\w+", text)

    @staticmethod
    def clean_text(text):
        """Strip special characters and normalize whitespace.

        Only Hangul syllables, ASCII letters, digits, and whitespace are
        kept; runs of whitespace collapse to one space and both ends are
        trimmed.
        """
        text = re.sub(r"[^가-힣a-zA-Z0-9\s]", "", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()

# Usage example
validator = TextValidator()

# E-mail validation
emails = ["user@example.com", "invalid.email", "test@sub.domain.com"]
for email in emails:
    print(f"{email}: {'유효' if validator.is_email(email) else '무효'}")

# Password strength check
passwords = ["weak", "Weak123", "Strong@123"]
for pwd in passwords:
    print(f"{pwd}: {'강함' if validator.is_password_strong(pwd) else '약함'}")

# Social-media text processing: pull out hashtags and mentions
social_text = "오늘 #파이썬 공부 완료! @friend1 @friend2 함께 해요 #프로그래밍 #코딩"
print(f"해시태그: {validator.extract_hashtags(social_text)}")
print(f"멘션: {validator.extract_mentions(social_text)}")

💡 실전 예제

1. 로그 파일 분석기

  
import re
from datetime import datetime
from collections import defaultdict

class LogAnalyzer:
    """Analyzer for web-server access logs in Common Log Format.

    Attributes:
        log_pattern: compiled pattern with five groups
            (client, timestamp, request line, status, size).
        stats: totals under the keys ``"total_requests"`` and ``"total_bytes"``.
        ip_stats: request count per client address.
        status_stats: request count per integer status code.
        url_stats: request count per requested URL.
    """

    def __init__(self):
        # Common Log Format: client ident user [time] "request" status size.
        # ident/user may be any non-space token (not only "-"), and the
        # size is "-" when no body was sent.
        self.log_pattern = re.compile(
            r'(\S+) \S+ \S+ \[(.*?)\] "(.*?)" (\d+) (\d+|-)'
        )
        self.stats = defaultdict(int)
        self.ip_stats = defaultdict(int)
        self.status_stats = defaultdict(int)
        self.url_stats = defaultdict(int)

    def parse_log_line(self, line):
        """Parse one log line.

        Returns:
            A dict with keys ``ip``, ``timestamp``, ``method``, ``url``,
            ``status`` (int) and ``size`` (int, 0 when logged as ``-``),
            or ``None`` if the line does not match ``log_pattern``.
        """
        match = self.log_pattern.match(line)
        if match is None:
            return None

        ip, timestamp, request, status, size = match.groups()

        # Split the request line into method and URL; a request line that
        # is not "METHOD URL HTTP..." is kept whole as the URL.
        request_match = re.match(r'(\w+) (.*?) HTTP', request)
        if request_match:
            method, url = request_match.groups()
        else:
            method, url = "UNKNOWN", request

        return {
            "ip": ip,
            "timestamp": timestamp,
            "method": method,
            "url": url,
            "status": int(status),
            "size": 0 if size == "-" else int(size)
        }

    def analyze_logs(self, log_lines):
        """Accumulate statistics from an iterable of log lines.

        Lines that cannot be parsed are skipped. Calling this repeatedly
        adds to the existing counters.
        """
        for line in log_lines:
            parsed = self.parse_log_line(line)
            if parsed is None:
                continue

            self.ip_stats[parsed["ip"]] += 1
            self.status_stats[parsed["status"]] += 1
            self.url_stats[parsed["url"]] += 1

            self.stats["total_requests"] += 1
            self.stats["total_bytes"] += parsed["size"]

    def get_top_ips(self, n=10):
        """Return the ``n`` busiest clients as ``(ip, count)`` pairs, busiest first."""
        return sorted(self.ip_stats.items(),
                      key=lambda x: x[1], reverse=True)[:n]

    def get_error_rate(self):
        """Return the percentage of requests with status >= 400 (0.0 if none were seen)."""
        total = self.stats["total_requests"]
        if total == 0:
            return 0.0

        errors = sum(count for status, count in self.status_stats.items()
                     if status >= 400)
        return (errors / total) * 100

    def get_report(self):
        """Build and return a multi-line text report of the collected statistics."""
        report = f"""
=== 로그 분석 리포트 ===
총 요청 수: {self.stats['total_requests']:,}
총 전송 바이트: {self.stats['total_bytes']:,}
평균 응답 크기: {self.stats['total_bytes'] / max(1, self.stats['total_requests']):.0f} bytes
에러율: {self.get_error_rate():.2f}%

상태 코드 분포:
"""
        for status, count in sorted(self.status_stats.items()):
            report += f"  {status}: {count:,}회\n"

        report += "\n가장 많이 방문한 IP (Top 5):\n"
        for ip, count in self.get_top_ips(5):
            report += f"  {ip}: {count:,}회\n"

        report += "\n가장 많이 요청된 URL (Top 5):\n"
        url_top = sorted(self.url_stats.items(),
                         key=lambda x: x[1], reverse=True)[:5]
        for url, count in url_top:
            report += f"  {url}: {count:,}회\n"

        return report

# Usage example: five sample access-log lines
sample_logs = [
    '192.168.1.1 - - [01/Mar/2024:10:15:30 +0900] "GET /index.html HTTP/1.1" 200 5132',
    '192.168.1.2 - - [01/Mar/2024:10:15:31 +0900] "POST /api/login HTTP/1.1" 200 234',
    '192.168.1.1 - - [01/Mar/2024:10:15:32 +0900] "GET /style.css HTTP/1.1" 304 0',
    '192.168.1.3 - - [01/Mar/2024:10:15:33 +0900] "GET /api/users HTTP/1.1" 401 45',
    '192.168.1.1 - - [01/Mar/2024:10:15:34 +0900] "GET /favicon.ico HTTP/1.1" 404 0',
]

# Feed the lines to the analyzer and print the generated report.
analyzer = LogAnalyzer()
analyzer.analyze_logs(sample_logs)
print(analyzer.get_report())

2. 텍스트 템플릿 엔진

  
import re

class SimpleTemplateEngine:
    """Minimal text template engine.

    Supported syntax:
        * ``{{ name }}``                         variable substitution
        * ``{{ user.name }}``                    nested dict access
        * ``{{ name|upper }}``                   one filter from ``self.filters``
        * ``{% if flag %} ... {% endif %}``      conditional block
        * ``{% for x in xs %} ... {% endfor %}`` loop block

    Inside a loop body the loop variable supports the same dotted access
    and filters as top-level variables. Unknown variables render as an
    empty string and unknown filter names are ignored.
    """

    _VARIABLE_RE = re.compile(r"\{\{\s*([^}]+)\s*\}\}")
    _IF_RE = re.compile(
        r"\{%\s*if\s+(\w+)\s*%\}(.*?)\{%\s*endif\s*%\}", re.DOTALL
    )
    _FOR_RE = re.compile(
        r"\{%\s*for\s+(\w+)\s+in\s+(\w+)\s*%\}(.*?)\{%\s*endfor\s*%\}", re.DOTALL
    )

    def __init__(self):
        # Built-in filters: name -> callable taking and returning a str.
        self.filters = {
            "upper": str.upper,
            "lower": str.lower,
            "title": str.title,
            "strip": str.strip,
            "capitalize": str.capitalize
        }

    @staticmethod
    def _lookup(name, context):
        """Resolve a possibly dotted ``name`` against ``context``.

        Each segment after the first is looked up only while the current
        value is a dict; anything missing resolves to ``""``.
        """
        first, *rest = name.split(".")
        value = context.get(first, "")
        for part in rest:
            if not isinstance(value, dict):
                return ""
            value = value.get(part, "")
        return value

    def _evaluate(self, expression, context):
        """Render one ``{{ ... }}`` expression (``name`` or ``name|filter``) to a str."""
        name, has_filter, filter_name = expression.partition("|")
        value = self._lookup(name.strip(), context)
        if has_filter:
            filter_func = self.filters.get(filter_name.strip())
            if filter_func is not None:
                value = filter_func(str(value))
        return str(value)

    def _substitute(self, text, context):
        """Replace every ``{{ ... }}`` in ``text`` using ``context``."""
        return self._VARIABLE_RE.sub(
            lambda match: self._evaluate(match.group(1), context), text
        )

    def render(self, template, context):
        """Render ``template`` with the values in ``context``.

        Args:
            template: the template text.
            context: dict mapping variable names to values.

        Returns:
            The rendered string. Blocks are processed in the order
            conditionals, loops, then variables.
        """
        def replace_conditional(match):
            condition, content = match.groups()
            return content if context.get(condition) else ""

        def replace_loop(match):
            item_name, list_name, content = match.groups()
            rendered = []
            for item in context.get(list_name, []):
                # Per-iteration scope: the outer context plus the loop variable.
                scope = dict(context)
                scope[item_name] = item
                rendered.append(self._substitute(content, scope))
            return "".join(rendered)

        result = self._IF_RE.sub(replace_conditional, template)
        result = self._FOR_RE.sub(replace_loop, result)
        return self._substitute(result, context)

    def add_filter(self, name, func):
        """Register a custom filter ``func`` (str -> str) under ``name``."""
        self.filters[name] = func

# Usage example
engine = SimpleTemplateEngine()

# Simple template: plain variable substitution
template1 = """
안녕하세요, {{ name }}님!
오늘은 {{ date }}입니다.
"""

context1 = {
    "name": "김파이썬",
    "date": "2024년 3월 15일"
}

print(engine.render(template1, context1))

# Template with a conditional and a loop.
# The block tags must be written exactly as "{%" and "%}" with no hidden
# characters in between, otherwise the engine does not recognize them.
template2 = """
=== 사용자 정보 ===
이름: {{ user.name }}
이메일: {{ user.email }}

{% if is_admin %}
[관리자 권한이 있습니다]
{% endif %}

=== 주문 목록 ===
{% for order in orders %}
- {{ order.product }} ({{ order.price }}원)
{% endfor %}

총 주문 금액: {{ total_price }}원
"""

context2 = {
    "user": {
        "name": "이영희",
        "email": "lee@example.com"
    },
    "is_admin": True,
    "orders": [
        {"product": "노트북", "price": 1200000},
        {"product": "마우스", "price": 50000},
        {"product": "키보드", "price": 80000}
    ],
    "total_price": 1330000
}

print(engine.render(template2, context2))

# Register a custom filter (usable as "{{ value|currency }}")
engine.add_filter("currency", lambda x: f"{int(x):,}원")

3. 마크다운 파서 (간단 버전)

  
import re

class SimpleMarkdownParser:
    """Small regex-based Markdown-to-HTML converter.

    Handles headers, bold/italic, inline code, images, links, horizontal
    rules, hard line breaks, fenced code blocks, blockquotes, unordered
    (optionally nested) lists, and paragraphs.

    Attributes:
        rules: ordered ``(pattern, replacement)`` pairs applied with
            ``re.MULTILINE``. Order matters: horizontal rules come before
            emphasis (so ``***`` is not read as emphasis) and images come
            before links (an image is a link prefixed with ``!``).
        list_pattern / ordered_list_pattern / code_block_pattern /
        blockquote_pattern: compiled block-level patterns.
            ``ordered_list_pattern`` is compiled but not used by ``parse``.
    """

    # Opaque token that stands in for a fenced code block while the other
    # rules run; it contains no characters any rule reacts to.
    _CODE_TOKEN = "\x00CODE{}\x00"

    # Paragraphs that already start with a block-level tag are not wrapped in <p>.
    _BLOCK_START = re.compile(r"</?(?:h[1-6]|ul|ol|li|pre|blockquote|hr)\b")

    def __init__(self):
        self.rules = [
            # Headers (longest marker first)
            (r"^#{6}\s+(.+)$", r"<h6>\1</h6>"),
            (r"^#{5}\s+(.+)$", r"<h5>\1</h5>"),
            (r"^#{4}\s+(.+)$", r"<h4>\1</h4>"),
            (r"^#{3}\s+(.+)$", r"<h3>\1</h3>"),
            (r"^#{2}\s+(.+)$", r"<h2>\1</h2>"),
            (r"^#\s+(.+)$", r"<h1>\1</h1>"),

            # Horizontal rules
            (r"^---+$", r"<hr>"),
            (r"^\*\*\*+$", r"<hr>"),

            # Bold and italic
            (r"\*\*(.+?)\*\*", r"<strong>\1</strong>"),
            (r"__(.+?)__", r"<strong>\1</strong>"),
            (r"\*(.+?)\*", r"<em>\1</em>"),
            (r"_(.+?)_", r"<em>\1</em>"),

            # Inline code
            (r"`(.+?)`", r"<code>\1</code>"),

            # Images (must precede links)
            (r"!\[([^\]]*)\]\(([^)]+)\)", r'<img src="\2" alt="\1">'),

            # Links
            (r"\[([^\]]+)\]\(([^)]+)\)", r'<a href="\2">\1</a>'),

            # Hard line break (two trailing spaces)
            (r"  $", r"<br>"),
        ]

        self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$", re.MULTILINE)
        self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$", re.MULTILINE)
        self.code_block_pattern = re.compile(r"```(.*?)\n(.*?)```", re.DOTALL)
        self.blockquote_pattern = re.compile(r"^>\s+(.+)$", re.MULTILINE)

    def _render_unordered_lists(self, text):
        """Convert runs of ``-``/``*``/``+`` items into ``<ul>`` markup.

        A stack of indentation widths tracks the open lists, so every
        ``<ul>`` that is opened is also closed, including at end of input.
        """
        output = []
        open_indents = []  # indentation of each currently open <ul>

        for line in text.split("\n"):
            match = self.list_pattern.match(line)
            if match is None:
                if open_indents:
                    output.append("</ul>" * len(open_indents))
                    open_indents.clear()
                output.append(line)
                continue

            indent = len(match.group(1))
            if not open_indents or indent > open_indents[-1]:
                output.append("<ul>")
                open_indents.append(indent)
            else:
                # Dedent: close nested lists, but keep the outermost one open.
                while len(open_indents) > 1 and indent < open_indents[-1]:
                    output.append("</ul>")
                    open_indents.pop()

            output.append(f"<li>{match.group(2)}</li>")

        if open_indents:
            output.append("</ul>" * len(open_indents))

        return "\n".join(output)

    def parse(self, markdown_text):
        """Convert ``markdown_text`` to HTML and return it as a string.

        Fenced code blocks are HTML-escaped and swapped for placeholder
        tokens first, so that list, inline, and paragraph processing cannot
        alter their contents; they are restored as the final step.
        """
        code_blocks = []

        def stash_code_block(match):
            language = match.group(1) or "text"
            code = (
                match.group(2)
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
            )
            code_blocks.append(
                f'<pre><code class="language-{language}">{code}</code></pre>'
            )
            return self._CODE_TOKEN.format(len(code_blocks) - 1)

        html = self.code_block_pattern.sub(stash_code_block, markdown_text)

        # Blockquotes
        html = self.blockquote_pattern.sub(r"<blockquote>\1</blockquote>", html)

        # Unordered lists
        html = self._render_unordered_lists(html)

        # Inline / single-line rules
        for pattern, replacement in self.rules:
            html = re.sub(pattern, replacement, html, flags=re.MULTILINE)

        # Paragraphs: wrap blank-line-separated chunks unless they are
        # already block-level markup or a code-block placeholder.
        processed_paragraphs = []
        for para in html.split("\n\n"):
            para = para.strip()
            is_block = para.startswith("\x00CODE") or self._BLOCK_START.match(para)
            if para and not is_block:
                para = f"<p>{para}</p>"
            processed_paragraphs.append(para)
        html = "\n\n".join(processed_paragraphs)

        # Restore the protected code blocks.
        for index, block in enumerate(code_blocks):
            html = html.replace(self._CODE_TOKEN.format(index), block)

        return html

# Usage example
parser = SimpleMarkdownParser()

# Sample document exercising headers, emphasis, nested lists, inline and
# fenced code, a link, an image, and a blockquote.
markdown = """
# 마크다운 테스트

이것은 **굵은 글씨**이고 이것은 *기울임 글씨*입니다.

## 리스트 예제

- 첫 번째 항목
- 두 번째 항목
  - 중첩된 항목
  - 또 다른 중첩 항목
- 세 번째 항목

## 코드 예제

인라인 코드: `print("Hello, World!")`

```python
def hello():
    print("Hello, World!")
```

## 링크와 이미지

[Python 공식 사이트](https://www.python.org)

![Python 로고](https://www.python.org/static/img/python-logo.png)

> 이것은 블록 인용입니다.

마지막 단락입니다.
"""

html = parser.parse(markdown)
print(html)

⚠️ 초보자가 자주 하는 실수

1. 문자열 수정 실수

  
# ❌ Strings are immutable: item assignment fails
text = "Python"
# text[0] = "J"  # TypeError!

# ✅ Build a new string instead
text = "Python"
text = "J" + text[1:]  # "Jython"
# or
text = text.replace("P", "J")

2. + 연산자 남용

  
# ❌ Using += inside a loop (inefficient)
result = ""
for i in range(10000):
    result += str(i)  # may build a new string on every iteration

# ✅ Use join()
result = "".join(str(i) for i in range(10000))

# ✅ Or collect the pieces in a list and join once
parts = []
for i in range(10000):
    parts.append(str(i))
result = "".join(parts)

3. 정규식 실수

  
# ❌ Special character left unescaped
pattern = "3.14"  # "." matches any character
re.match(pattern, "3x14")  # matches!

# ✅ Escape it
pattern = r"3\.14"  # or "3\\.14"
re.match(pattern, "3.14")  # matches only the literal text

4. 인코딩 문제

  
# ❌ No encoding given
with open("korean.txt", "r") as f:  # Korean text may come out garbled
    content = f.read()

# ✅ State UTF-8 explicitly
with open("korean.txt", "r", encoding="utf-8") as f:
    content = f.read()

5. strip() vs replace()

  
# ❌ Trying to remove inner spaces with strip()
text = "Hello  World"
result = text.strip()  # "Hello  World" - strip() only trims the ends

# ✅ Use replace()
text = "Hello  World"
result = text.replace(" ", "")  # "HelloWorld"
# or collapse whitespace runs with a regex
result = re.sub(r"\s+", " ", text)  # "Hello World"

🎯 핵심 정리

문자열 처리 Best Practices

  
# 1. Use join() to concatenate strings
# Good
parts = ["Hello", "World", "from", "Python"]
result = " ".join(parts)

# Bad (repeated + in a loop; also leaves a trailing space)
result = ""
for part in parts:
    result += part + " "

# 2. Use f-strings for formatting
name = "Python"
version = 3.12
# Good
message = f"{name} {version}"
# Bad
message = "%s %s" % (name, version)

# 3. Case-insensitive comparison
# NOTE(review): `text` is not defined in this snippet; presumably it is
# carried over from an earlier example - confirm before running standalone.
if text.lower() == "python":
    pass

# 4. Checking for an empty string
if not text:  # an empty string is falsy
    print("텍스트가 비어있습니다")

정규표현식 Best Practices

  
# 1. Compile complex patterns once and reuse them
email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
# NOTE(review): `email` is not defined in this snippet; presumably it is
# carried over from an earlier example - confirm before running standalone.
if email_pattern.match(email):
    pass

# 2. Use raw strings (no doubled backslashes)
pattern = r"\d{4}-\d{2}-\d{2}"  # good
pattern = "\\d{4}-\\d{2}-\\d{2}"  # bad

# 3. Use named groups for readability
pattern = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"

# 4. Document complex patterns inline with re.VERBOSE
pattern = re.compile(r"""
    ^                    # 시작
    (?P<protocol>https?) # 프로토콜
    ://                  # 구분자
    (?P<domain>[^/]+)    # 도메인
    (?P<path>/.*)?       # 경로 (선택적)
    $                    # 끝
""", re.VERBOSE)

성능 고려사항

graph TD
    A[문자열 처리 성능] --> B[연결: join > +=]
    A --> C[검색: in > find > regex]
    A --> D[변환: translate > replace]
    A --> E[포맷: f-string > format > %]
    
    F[정규표현식 성능] --> G[컴파일된 패턴 사용]
    F --> H[구체적인 패턴 작성]
    F --> I[불필요한 그룹 최소화]
    F --> J[탐욕적 vs 게으른 수량자]

🎓 파이썬 마스터하기 시리즈

📚 기초편 (1-7)

🚀 중급편 (8-12)

💼 고급편 (13-16)

이전글: 리스트, 튜플, 딕셔너리 정복하기 ⬅️ 현재글: 문자열 처리와 정규표현식 다음글: 파일 입출력과 예외 처리 ➡️

이번 포스트에서는 문자열 처리의 다양한 방법과 강력한 정규표현식을 학습했습니다. 다음 포스트에서는 파일 입출력과 예외 처리에 대해 자세히 알아보겠습니다. Happy Coding! 🐍✨

Programming, Python