'html parse' 태그의 글 목록

html parse

WebBrowser이용한 스크린스크래핑 2018.05.09

WebBrowser이용한 스크린스크래핑

2018. 5. 9. 09:09

스크린스크래핑

WebBrowser이용해서 website에 렌더링된 html의 특정값을 추출한다.

예를 들어 특정 사이트 모 카테고리에 있는 내용을 모두 가져와서 데이터화하고 그것을 분석하고 싶을 때 스크래핑을 사용하면 편리하다.

아래 code는

특정 카테고리의 글 list를 통해 key, title을 추출하고

Key를 이용해 본문내용의 text를 가져와서 db화 한 내용이다.

글의 pattern 등 분석 자료로 사용할 목적으로 데이터가 필요하나, 해당 사이트에서 DB를 공개하지는 않으니 이렇게 스크래핑해서 자료화하여 사용하는 것이다.

이렇게 하지 않으면 사이트의 모든 글을 하나하나 클릭~클릭~클릭~ 하며 확인해야 하겠지.

이렇게 프로그램하기 전에 사전에 해당 사이트를 분석하고 맞게 프로그래밍하면 된다.

step1) 카테고리의 글 list를 통해 key, title을 추출 : webSiteContentsList();

=> 추출된 data를 수작업으로 insert스크립트만들어 TMP_HELP 에 insert함.

step2) Key를 이용해 본문내용의 text를 가져와서 db화 한 내용 : webSiteContentsDetail(oBrowser);

<c# code>

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.IO;
using MySql.Data.MySqlClient;

namespace ScrappingWinForms
{
    public partial class Form1 : Form
    {

        // Counter incremented once per processed page. btnSearch_Click stops issuing
        // new requests once it reaches 200; webSiteContentsList uses it as the page number.
        int iq = 0;

        // Key of the article currently being scraped; compared against the loaded URL
        // before the scraped text is written back to the database.
        string contentsKey = "";

        // MySQL connection string. The user-name option is "Uid"; the previous
        // "contentsKey=root" is not a connection-string keyword.
        string sConnectionString = "Server=localhost;Uid=root;Pwd=???;Database=??????";
        string sContentsUrl = "http://www.doitforyou.co.kr/sread.html?key="; // placeholder (fake) URL
        string sListUrl = "http://www.doitforyou.co.kr/sub.html?&sec=cc&section=culture&page="; // placeholder (fake) URL

        // Last URL typed into txtUrl and navigated to by surftheWeb().
        String Url = string.Empty;

        public Form1()
        {
            InitializeComponent();

            // Pages whose JavaScript fails to bind inside the WebBrowser control raise
            // script-error dialogs; suppress them so loading can continue unattended.
            webBrowserViewer.ScriptErrorsSuppressed = true;
            webBrowserViewer.DocumentTitleChanged += webBrowserViewer_DocumentTitleChanged;
        }
        
        /// <summary>
        /// Starts (or continues) the scraping cycle by looking up the next key to process.
        /// Does nothing once the page counter has reached 200.
        /// </summary>
        /// <param name="sender">Not used.</param>
        /// <param name="e">Not used.</param>
        private void btnSearch_Click(object sender, EventArgs e)
        {
            if (iq >= 200)
            {
                return;
            }

            selectContentsKeyfromWebsite();
        }

        #region surftheWeb :WebBrowser에 SitePage 브라우징한다.
        
        /// <summary>
        /// Navigates the browser control to the address typed into txtUrl,
        /// remembering that address in the Url field.
        /// </summary>
        private void surftheWeb()
        {
            this.Url = this.txtUrl.Text;
            this.webBrowserViewer.Navigate(this.Url);
        }

        /// <summary>
        /// Navigates the browser control to the given address.
        /// </summary>
        /// <param name="sUrl">Address to load.</param>
        private void surftheWeb(String sUrl)
        {
            this.webBrowserViewer.Navigate(sUrl);
        }

        #endregion


        #region Select / Update from the WebSite

        /// <summary>
        /// Reads one HELP_TOPIC_ID (>= 100) from TMP_HELP whose DESCRIPTION is still empty and,
        /// when it differs from the key currently being processed, navigates the browser
        /// to that article's detail page.
        /// Returns without navigating when the query yields no row.
        /// </summary>
        private void selectContentsKeyfromWebsite()
        {
            object oKey;

            // The using blocks dispose the command and connection on every path,
            // including when Open() itself throws; exceptions propagate with their
            // original stack trace.
            using (MySqlConnection oConnection = new MySqlConnection(sConnectionString))
            using (MySqlCommand cmd = oConnection.CreateCommand())
            {
                oConnection.Open();
                cmd.CommandText = "SELECT HELP_TOPIC_ID FROM TMP_HELP WHERE HELP_TOPIC_ID >= 100 AND DESCRIPTION = '' LIMIT 1";

                // Only a single value is needed, so no adapter/DataSet is required.
                oKey = cmd.ExecuteScalar();
            }

            // ExecuteScalar yields null when the result set is empty: nothing left to scrape.
            if (oKey == null || oKey == DBNull.Value)
            {
                return;
            }

            string sNextKey = oKey.ToString();
            if (contentsKey != sNextKey)
            {
                contentsKey = sNextKey;
                surftheWeb(sContentsUrl + contentsKey);
            }
        }
              
        /// <summary>
        /// Stores the scraped text in tmp_help.description for the given key, then
        /// triggers the lookup of the next key to process.
        /// </summary>
        /// <param name="sHelp_topic_id">Key of the row to update; must parse as an integer.</param>
        /// <param name="sDescription">Scraped text to store.</param>
        private void updateContentonIldaro(String sHelp_topic_id, String sDescription)
        {
            // The using blocks dispose the command and connection on every path;
            // exceptions propagate with their original stack trace.
            using (MySqlConnection oConnection = new MySqlConnection(sConnectionString))
            using (MySqlCommand cmd = oConnection.CreateCommand())
            {
                oConnection.Open();
                cmd.CommandText = "UPDATE tmp_help set description = @description where help_topic_id = @help_topic_id";

                cmd.Parameters.AddWithValue("@help_topic_id", int.Parse(sHelp_topic_id));
                cmd.Parameters.AddWithValue("@description", sDescription);
                cmd.ExecuteNonQuery();
            }

            // Look up the next key only after the update has succeeded; doing this from a
            // finally block would also run it while an exception is propagating.
            btnSearch_Click(null, null);
        }
        #endregion

        /// <summary>
        /// Handles the WebBrowser DocumentCompleted event: shows the loaded address in
        /// txtUrl and scrapes the article detail from the loaded page.
        /// </summary>
        /// <param name="sender">The WebBrowser that raised the event.</param>
        /// <param name="e">Event data (not used).</param>
        private void webBrowserViewer_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            WebBrowser oBrowser = sender as WebBrowser;

            // Handlers in this form are also invoked directly with a null sender
            // (see btnSearch_Click(null, null)), so do not dereference blindly.
            if (oBrowser == null || oBrowser.Url == null)
            {
                return;
            }

            this.txtUrl.Text = oBrowser.Url.AbsoluteUri;

            // Switch to webSiteContentsList() to scrape the paged article list instead.
            webSiteContentsDetail(oBrowser);
        }

        /// <summary>
        /// Counts one more processed page and scrapes the article text from it.
        /// </summary>
        /// <param name="oBrowser">Browser holding the loaded detail page.</param>
        private void webSiteContentsDetail(WebBrowser oBrowser)
        {
            // The counter bounds the number of repetitions (checked in btnSearch_Click).
            iq += 1;

            getWebSiteContentsDetail(oBrowser);
        }

        /// <summary>
        /// Scrapes one page of the paged article list, then navigates to the next
        /// list page while the page counter is below 21.
        /// </summary>
        private void webSiteContentsList()
        {
            // The counter doubles as the page number of the list.
            iq += 1;

            getData();

            if (iq >= 21)
            {
                return;
            }

            surftheWeb(sListUrl + iq.ToString());
        }

        /// <summary>
        /// Collects the rendered elements with the given tag name whose inner HTML
        /// starts with an anchor pointing at the article detail page.
        /// </summary>
        /// <param name="wb">Browser holding the rendered document.</param>
        /// <param name="tagName">Tag name of the candidate elements.</param>
        /// <returns>The matching elements; empty when no document is loaded or nothing matches.</returns>
        private HtmlElement[] GetElementsByTagName(WebBrowser wb, string tagName)
        {
            // Must agree with the prefix getData strips ("<A href=\"sread.html?key=")
            // and with the detail page named in sContentsUrl.
            const string sLinkPrefix = "<A href=\"sread.html";

            var l = new List<HtmlElement>();

            // No document has been loaded yet: nothing to inspect.
            if (wb.Document == null)
            {
                return l.ToArray();
            }

            foreach (HtmlElement el in wb.Document.GetElementsByTagName(tagName))
            {
                string sHtml = el.InnerHtml;

                // Markup is compared ordinally (byte-for-byte), not by culture rules.
                if (sHtml != null && sHtml.StartsWith(sLinkPrefix, StringComparison.Ordinal))
                {
                    l.Add(el);
                }
            }

            return l.ToArray();
        }


        /// <summary>
        /// Collects the rendered elements with the given tag name whose
        /// "className" attribute equals the requested class name.
        /// </summary>
        /// <param name="wb">Browser holding the rendered document.</param>
        /// <param name="tagName">Tag name of the candidate elements.</param>
        /// <param name="className">Exact value the "className" attribute must have.</param>
        /// <returns>The matching elements, in document order; empty when nothing matches.</returns>
        private HtmlElement[] GetElementsByTagNClassName(WebBrowser wb, string tagName, string className)
        {
            return wb.Document
                .GetElementsByTagName(tagName)
                .Cast<HtmlElement>()
                .Where(candidate => candidate.GetAttribute("className") == className)
                .ToArray();
        }

        /// <summary>
        /// Extracts the article text from the detail page (every td element with class
        /// "contentsbody") and stores it when the loaded URL belongs to the current key.
        /// </summary>
        /// <param name="oBrowser">Browser whose URL is checked against the current key.</param>
        private void getWebSiteContentsDetail(WebBrowser oBrowser)
        {
            HtmlElement[] bodyCells = GetElementsByTagNClassName(webBrowserViewer, "td", "contentsbody");

            foreach (HtmlElement cell in bodyCells)
            {
                // Some articles have no text (e.g. an img tag replaces it);
                // fall back to the raw markup in that case.
                String sText = cell.OuterText ?? cell.InnerHtml;

                // Whatever remains of the URL after removing the detail-page prefix is the key.
                String sLoadedKey = oBrowser.Url.AbsoluteUri.Replace(sContentsUrl, "");
                if (contentsKey.Equals(sLoadedKey))
                {
                    updateContentonIldaro(contentsKey, sText);
                }
            }
        }

        /// <summary>
        /// Extracts "key:title" pairs from the dt elements of the loaded list page and
        /// appends them, preceded by a page header line, to the console and to txtSource.
        /// </summary>
        private void getData()
        {
            // dt elements whose content starts with a link to an article.
            var arrCollection = GetElementsByTagName(webBrowserViewer, "dt");

            string sHeader = "-- " + iq.ToString() + " --------------";
            Console.WriteLine(sHeader);

            // Build the page's output once instead of reassigning txtSource.Text per row.
            var sbOutput = new StringBuilder();
            sbOutput.Append(sHeader).Append("\r\n");

            foreach (HtmlElement el in arrCollection)
            {
                // Strip the anchor markup so that "key>title" remains.
                string sText = el.InnerHtml
                    .Replace("<A href=\"sread.html?key=", "")
                    .Replace("</A>", "")
                    .Replace("\"", "");

                string[] arrSplit = sText.Split('>');
                if (arrSplit.Length < 2)
                {
                    // Not in the expected "key>title" shape; skip instead of indexing out of range.
                    continue;
                }

                string sLine = arrSplit[0] + ":" + arrSplit[1];
                Console.WriteLine(sLine);
                sbOutput.Append(sLine).Append("\r\n");
            }

            txtSource.Text = txtSource.Text + sbOutput.ToString();
        }

        #endregion

reference to : https://www.codeproject.com/Tips/858775/Csharp-Website-HTML-Content-Parsing-or-How-To-Get

저작자표시 비영리 변경금지

'프로그래밍 > c#' 카테고리의 다른 글

c#에서 mariadb사용하기 (0)	2018.11.10
WebBrowser.DocumentText 한글깨짐 (0)	2018.05.07
using MySqlConnector (0)	2018.05.07

PREV 1 NEXT

Do It For You

html parse

WebBrowser이용한 스크린스크래핑

'프로그래밍 > c#' 카테고리의 다른 글

+ Recent posts

티스토리툴바