using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.IO;
using MySql.Data.MySqlClient;
namespace ScrappingWinForms
{
public partial class Form1 : Form
{
int iq = 0;
string contentsKey = "";
string sConnectionString = "Server=localhost;contentsKey=root;Pwd=???;Database=??????";
string sContentsUrl = "http://www.doitforyou.co.kr/sread.html?key="; //fake url임
string sListUrl = "http://www.doitforyou.co.kr/sub.html?&sec=cc§ion=culture&page="; //fake url임
String Url = string.Empty;
public Form1()
{
InitializeComponent();
//WebBrowser로 사이트 로딩시 자바스크립트 바인딩안되어 에러나는 경우.
//자바스크립트 오류(dialogbox) 무시하고 진행가능하도록 제어함.
this.webBrowserViewer.ScriptErrorsSuppressed = true;
webBrowserViewer.DocumentTitleChanged += new EventHandler(webBrowserViewer_DocumentTitleChanged);
}
private void btnSearch_Click(object sender, EventArgs e)
{
// surftheWeb();
if (iq < 200)
{
selectContentsKeyfromWebsite();
}
}
#region surftheWeb :WebBrowser에 SitePage 브라우징한다.
private void surftheWeb()
{
Url = txtUrl.Text;
webBrowserViewer.Navigate(Url);
}
private void surftheWeb(String sUrl)
{
webBrowserViewer.Navigate(sUrl);
}
#endregion
#region Select / Update from the WebSite
private void selectContentsKeyfromWebsite()
{
MySqlConnection oConnection = new MySqlConnection(sConnectionString);
oConnection.Open();
try
{
MySqlCommand cmd = oConnection.CreateCommand();
cmd.CommandText = "SELECT HELP_TOPIC_ID FROM TMP_HELP WHERE HELP_TOPIC_ID >= 100 AND DESCRIPTION = '' LIMIT 1";
MySqlDataAdapter aDap = new MySqlDataAdapter(cmd);
DataSet dsData = new DataSet();
aDap.Fill(dsData);
if (contentsKey != dsData.Tables[0].Rows[0][0].ToString())
{
contentsKey = dsData.Tables[0].Rows[0][0].ToString();
String sUrl = String.Empty;
sUrl = sContentsUrl + contentsKey;
surftheWeb(sUrl);
}
}
catch (Exception ex)
{
throw ex;
}
finally
{
if (oConnection.State == ConnectionState.Open)
{
oConnection.Close();
}
}
}
/// <summary>
/// key값으로 contents내용을 업데이트 한다.
/// </summary>
/// <param name="sHelp_topic_id"></param>
/// <param name="sDescription"></param>
private void updateContentonIldaro(String sHelp_topic_id, String sDescription)
{
MySqlConnection oConnection = new MySqlConnection(sConnectionString);
oConnection.Open();
try
{
MySqlCommand cmd = oConnection.CreateCommand();
cmd.CommandText = "UPDATE tmp_help set description = @description where help_topic_id = @help_topic_id";
cmd.Parameters.AddWithValue("@help_topic_id", int.Parse(sHelp_topic_id));
cmd.Parameters.AddWithValue("@description", sDescription);
cmd.ExecuteNonQuery();
}
catch (Exception ex)
{
throw ex;
}
finally
{
if (oConnection.State == ConnectionState.Open)
{
oConnection.Close();
//key를 다시 조회한다.
btnSearch_Click(null, null);
}
}
}
#endregion
/// <summary>
/// WebBrowser's DocumentCompleted Event
/// </summary>
/// <param name="sender"></param>
/// <param name="e"></param>
private void webBrowserViewer_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
WebBrowser oBrowser = sender as WebBrowser;
//txtSource.Text = StreamConvertEUCKRtoUTF8(oBrowser);
this.txtUrl.Text = oBrowser.Url.AbsoluteUri;
//webSiteContentsList();
webSiteContentsDetail(oBrowser);
}
private void webSiteContentsDetail(WebBrowser oBrowser)
{
iq++; //반복수행위한 카운팅
getWebSiteContentsDetail(oBrowser);
}
/// <summary>
/// 게시판 글list, Paging있음
/// </summary>
private void webSiteContentsList() {
iq++; //paging 숫자
getData();
if (iq < 21)
{
String sUrl = sListUrl + iq.ToString();
surftheWeb(sUrl);
}
}
/// <summary>
/// 1.WebBrowser로 rendering된 html을 TagName으로 HtmlElementCollection에 담는다.
/// 2.HtmlElementCollection => HtmlElement에 담긴 InnerHtml분석한다.
/// 3.HtmlElement[]에 추출하려는 대상을 담는다.
/// </summary>
/// <param name="wb"></param>
/// <param name="tagName"></param>
/// <returns></returns>
private HtmlElement[] GetElementsByTagName(WebBrowser wb, string tagName)
{
var l = new List<HtmlElement>();
var els = wb.Document.GetElementsByTagName(tagName); // all elems with tag
foreach (HtmlElement el in els)
{
if (el.InnerHtml == null)
{
continue;
}
if(el.InnerHtml.StartsWith("<A href=\"read.html"))
{
l.Add(el);
}
}
return l.ToArray();
}
/// <summary>
/// 1.WebBrowser로 rendering된 html을 TagName으로 HtmlElementCollection에 담는다.
/// 2.HtmlElementCollection => HtmlElement에 담긴 요소들중 className을 이용해서 추출하려는 대상을 뽑는다..
/// 3.HtmlElement[]에 추출하려는 대상을 담는다.
/// </summary>
/// <param name="wb"></param>
/// <param name="tagName"></param>
/// <param name="className"></param>
/// <returns></returns>
private HtmlElement[] GetElementsByTagNClassName(WebBrowser wb, string tagName, string className)
{
var l = new List<HtmlElement>();
var els = wb.Document.GetElementsByTagName(tagName); // all elems with tag
foreach (HtmlElement el in els)
{
if (el.GetAttribute("className") == className)
{
l.Add(el);
}
}
return l.ToArray();
}
/// <summary>
/// 글 상세 페이지에서 추출하고자 하는 Text만 가져온다.
/// </summary>
/// <param name="oBrowser"></param>
private void getWebSiteContentsDetail(WebBrowser oBrowser)
{
var arrCollection = GetElementsByTagNClassName(webBrowserViewer, "td", "contentsbody");
for (int i = 0; i < arrCollection.Length; i++)
{
//Console.WriteLine(arrCollection[i].InnerHtml);
String sText = String.Empty;
//sText = arrCollection[i].InnerHtml;
sText = arrCollection[i].OuterText;
if (sText == null)
{
//글 text가 없는 경우가 있음
//case1) 이미지로 대체한 경우 img html tag가 글 대신 있음
sText = arrCollection[i].InnerHtml;
}
if (contentsKey.Equals(oBrowser.Url.AbsoluteUri.Replace(sContentsUrl, "")))
{
updateContentonIldaro(contentsKey, sText);
}
}
}
private void getData() {
// getting day and night temperature
var arrCollection = GetElementsByTagName(webBrowserViewer, "dt");
DataTable dt = new DataTable();
dt.Columns.Add("contentsKey", typeof(int));
dt.Columns.Add("Title", typeof(string));
Console.WriteLine("-- " + iq.ToString() + " --------------");
txtSource.Text = txtSource.Text + "-- " + iq.ToString() + " --------------" + "\r\n";
for (int i = 0; i < arrCollection.Length; i++) {
//Console.WriteLine(arrCollection[i].InnerHtml);
String sText = String.Empty;
sText = arrCollection[i].InnerHtml;
sText = sText.Replace("<A href=\"sread.html?key=", "").Replace("</A>", "").Replace("\"", "");
String[] arrSplit = sText.Split('>');
dt.Rows.Add(arrSplit[0],arrSplit[1]);
Console.WriteLine(arrSplit[0] + ":" + arrSplit[1]);
txtSource.Text = txtSource.Text + arrSplit[0] + ":" + arrSplit[1] + "\r\n";
}
}
#endregion